sysdeps/alpha/stxncpy.S

   1 /* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson (rth@tamu.edu)
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 /* Copy no more than COUNT bytes of the null-terminated string from
  21    SRC to DST.
  22
  23    This is an internal routine used by strncpy, stpncpy, and strncat.
  24    As such, it uses special linkage conventions to make implementation
  25    of these public functions more efficient.
  26
  27    On input:
  28         t9 = return address
  29         a0 = DST
  30         a1 = SRC
  31         a2 = COUNT
  32
  33    Furthermore, COUNT may not be zero.
  34
  35    On output:
  36         t0  = last word written
  37         t8  = bitmask (with one bit set) indicating the last byte written
  38         t10 = bitmask (with one bit set) indicating the byte position of
  39               the end of the range specified by COUNT
  40         a0  = unaligned address of the last *word* written
  41         a2  = the number of full words left in COUNT
  42
  43    Furthermore, v0, a3-a5, t11, and t12 are untouched.
  44 */
  45
  46
  47 /* This is generally scheduled for the EV5, but should still be pretty
  48    good for the EV4 too.  */
  49
  50 #include <sysdep.h>
  51
  52         .set noat
  53         .set noreorder
  54
  55         .text
  56
  57 /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7):
  58    one of them doesn't like having the entry point for a procedure
  59    somewhere in the middle of the procedure descriptor.  Work around
  60    this by putting the aligned copy in its own procedure descriptor.  */
  61
  62         .ent stxncpy_aligned
  63         .align 3
  64 stxncpy_aligned:                # co-aligned copy; entered from __stxncpy when (DST ^ SRC) & 7 == 0
  65         .frame sp, 0, t9, 0     # frame size 0; return address register is t9
  66         .prologue 0
  67
  68         /* On entry to this basic block:
  69            t0 == the first destination word for masking back in
  70            t1 == the first source word.  */
  71
  72         /* Create the 1st output word and detect 0's in the 1st input word.  */
  73         lda     t2, -1          # e1    : build a mask against false zero
  74         mskqh   t2, a1, t2      # e0    :   detection in the src word
  75         mskqh   t1, a1, t3      # e0    : t3 = src bytes at or above offset (a1 & 7)
  76         ornot   t1, t2, t2      # .. e1 : t2 = src word with bytes below the offset forced nonzero
  77         mskql   t0, a1, t0      # e0    : assemble the first output word
  78         cmpbge  zero, t2, t7    # .. e1 : bits set iff null found
  79         or      t0, t3, t0      # e0    : t0 = kept dest low bytes | src high bytes
  80         beq     a2, $a_eoc      # .. e1 : no full words in COUNT: add the end-of-count bit
  81         bne     t7, $a_eos      # .. e1 : null in the first word: go finish
  82
  83         /* On entry to this basic block:
  84            t0 == a source word not containing a null.  */
  85
  86 $a_loop:
  87         stq_u   t0, 0(a0)       # e0    : store the completed word
  88         addq    a0, 8, a0       # .. e1 : advance dest pointer one word
  89         ldq_u   t0, 0(a1)       # e0    : load the next source word
  90         addq    a1, 8, a1       # .. e1 : advance src pointer one word
  91         subq    a2, 1, a2       # e0    : one fewer full word left in COUNT
  92         cmpbge  zero, t0, t7    # .. e1 (stall) : flag null bytes in the new word
  93         beq     a2, $a_eoc      # e1    : COUNT ends inside this word
  94         beq     t7, $a_loop     # e1    : no null yet: keep copying
  95
  96         /* Take care of the final (partial) word store.  At this point
  97            the end-of-count bit is set in t7 iff it applies.
  98
  99            On entry to this basic block we have:
 100            t0 == the source word containing the null
 101            t7 == the cmpbge mask that found it.  */
 102
 103 $a_eos:
 104         negq    t7, t8          # e0    : find low bit set
 105         and     t7, t8, t8      # e1 (stall) : t8 = single bit marking the last byte to write
 106
 107         /* For the sake of the cache, don't read a destination word
 108            if we're not going to need it.  */
 109         and     t8, 0x80, t6    # e0    : is the last byte to write byte 7 of the word?
 110         bne     t6, 1f          # .. e1 (zdb) : yes: the whole word is written, skip the dest load
 111
 112         /* We're doing a partial word store and so need to combine
 113            our source and original destination words.  */
 114         ldq_u   t1, 0(a0)       # e0    : load the original destination word
 115         subq    t8, 1, t6       # .. e1 : t6 = all bits below the last-byte bit
 116         or      t8, t6, t7      # e0    : t7 = byte mask of bytes up to and including the last byte
 117         unop                    #
 118         zapnot  t0, t7, t0      # e0    : clear src bytes > null
 119         zap     t1, t7, t1      # .. e1 : clear dst bytes <= null
 120         or      t0, t1, t0      # e1    : merge kept src bytes with preserved dst bytes
 121
 122 1:      stq_u   t0, 0(a0)       # e0    : store the final word
 123         ret     (t9)            # e1    : return through t9
 124
 125         /* Add the end-of-count bit to the eos detection bitmask.  */
 126 $a_eoc:
 127         or      t10, t7, t7     # t7 |= end-of-count bit (t10)
 128         br      $a_eos          # finish with the common final-word store
 129
 130         .end stxncpy_aligned
 131
 132         .align 3
 133         .ent __stxncpy
 134         .globl __stxncpy
 135 __stxncpy:                      # entry: a0 = DST, a1 = SRC, a2 = COUNT (nonzero), t9 = return address
 136         .frame sp, 0, t9, 0     # frame size 0; return address register is t9
 137         .prologue 0
 138
 139         /* Are source and destination co-aligned?  */
 140         xor     a0, a1, t1      # e0    : t1 = DST ^ SRC
 141         and     a0, 7, t0       # .. e1 : find dest misalignment
 142         and     t1, 7, t1       # e0    : nonzero iff the low 3 address bits differ
 143         addq    a2, t0, a2      # .. e1 : bias count by dest misalignment
 144         subq    a2, 1, a2       # e0    : a2 = COUNT - 1 + dest misalignment
 145         and     a2, 7, t2       # e1    : t2 = byte index of the last COUNT byte within its dest word
 146         srl     a2, 3, a2       # e0    : a2 = loop counter = (count - 1)/8
 147         addq    zero, 1, t10    # .. e1 : t10 = 1, shifted into place below
 148         sll     t10, t2, t10    # e0    : t10 = bitmask of last count byte
 149         bne     t1, $unaligned  # .. e1 : offsets differ: take the unaligned path
 150
 151         /* We are co-aligned; take care of a partial first word.  */
 152
 153         ldq_u   t1, 0(a1)       # e0    : load first src word
 154         addq    a1, 8, a1       # .. e1 : advance src pointer one word
 155
 156         beq     t0, stxncpy_aligned     # avoid loading dest word if not needed
 157         ldq_u   t0, 0(a0)       # e0    : load first dest word for masking back in
 158         br      stxncpy_aligned # .. e1 : continue in the aligned copy
 159
 160
 161 /* The source and destination are not co-aligned.  Align the destination
 162    and cope.  We have to be very careful about not reading too much and
 163    causing a SEGV.  */
 164
 165         .align 3
 166 $u_head:
 167         /* We know just enough now to be able to assemble the first
 168            full source word.  We can still find a zero at the end of it
 169            that prevents us from outputting the whole thing.
 170
 171            On entry to this basic block:
 172            t0 == the first dest word, unmasked
 173            t1 == the shifted low bits of the first source word
 174            t6 == bytemask that is -1 in dest word bytes */
 175
 176         ldq_u   t2, 8(a1)       # e0    : load second src word
 177         addq    a1, 8, a1       # .. e1 : advance src pointer one word
 178         mskql   t0, a0, t0      # e0    : mask trailing garbage in dst
 179         extqh   t2, a1, t4      # e0    : shift the second word bytes up into position
 180         or      t1, t4, t1      # e1    : first aligned src word complete
 181         mskqh   t1, a0, t1      # e0    : mask leading garbage in src
 182         or      t0, t1, t0      # e0    : first output word complete
 183         or      t0, t6, t6      # e1    : mask original data for zero test
 184         cmpbge  zero, t6, t7    # e0    : flag null bytes among the copied source bytes
 185         beq     a2, $u_eocfin   # .. e1 : COUNT ends inside this first word
 186         bne     t7, $u_final    # e1    : null inside this first word
 187
 188         lda     t6, -1                  # e1    : mask out the bits we have
 189         mskql   t6, a1, t6              # e0    :   already seen
 190         stq_u   t0, 0(a0)               # e0    : store first output word
 191         or      t6, t2, t2              # .. e1 : force already-seen bytes nonzero
 192         cmpbge  zero, t2, t7            # e0    : find nulls in second partial
 193         addq    a0, 8, a0               # .. e1 : advance dest pointer one word
 194         subq    a2, 1, a2               # e0    : one fewer full word left in COUNT
 195         bne     t7, $u_late_head_exit   # .. e1 : null in the rest of the second src word
 196
 197         /* Finally, we've got all the stupid leading edge cases taken care
 198            of and we can set up to enter the main loop.  */
 199
 200         extql   t2, a1, t1      # e0    : position hi-bits of lo word
 201         ldq_u   t2, 8(a1)       # .. e1 : read next high-order source word
 202         addq    a1, 8, a1       # e0    : advance src pointer one word
 203         cmpbge  zero, t2, t7    # e1 (stall) : flag null bytes in the new word
 204         beq     a2, $u_eoc      # e1    : COUNT ends inside the next output word
 205         bne     t7, $u_eos      # e1    : null in the word just read
 206
 207         /* Unaligned copy main loop.  In order to avoid reading too much,
 208            the loop is structured to detect zeros in aligned source words.
 209            This has, unfortunately, effectively pulled half of a loop
 210            iteration out into the head and half into the tail, but it does
 211            prevent nastiness from accumulating in the very thing we want
 212            to run as fast as possible.
 213
 214            On entry to this basic block:
 215            t1 == the shifted high-order bits from the previous source word
 216            t2 == the unshifted current source word
 217
 218            We further know that t2 does not contain a null terminator.  */
 219
 220         .align 3
 221 $u_loop:
 222         extqh   t2, a1, t0      # e0    : extract high bits for current word
 223         addq    a1, 8, a1       # .. e1 : advance src pointer one word
 224         extql   t2, a1, t3      # e0    : extract low bits for next time
 225         addq    a0, 8, a0       # .. e1 : advance dest pointer (store below uses -8)
 226         or      t0, t1, t0      # e0    : current dst word now complete
 227         ldq_u   t2, 0(a1)       # .. e1 : load high word for next time
 228         stq_u   t0, -8(a0)      # e0    : save the current word
 229         mov     t3, t1          # .. e1 : carry the low bits into the next iteration
 230         subq    a2, 1, a2       # e0    : one fewer full word left in COUNT
 231         cmpbge  zero, t2, t7    # .. e1 : test new word for eos
 232         beq     a2, $u_eoc      # e1    : COUNT ends inside the next output word
 233         beq     t7, $u_loop     # e1    : no null yet: keep copying
 234
 235         /* We've found a zero somewhere in the source word we just read.
 236            If it resides in the lower half, we have one (probably partial)
 237            word to write out, and if it resides in the upper half, we
 238            have one full and one partial word left to write out.
 239
 240            On entry to this basic block:
 241            t1 == the shifted high-order bits from the previous source word
 242            t2 == the unshifted current source word.  */
 243 $u_eos:
 244         extqh   t2, a1, t0      # e0    : extract high bits for current word
 245         or      t0, t1, t0      # e1    : first (partial) source word complete
 246
 247         cmpbge  zero, t0, t7    # e0    : is the null in this first bit?
 248         bne     t7, $u_final    # .. e1 (zdb) : yes: this is the final word
 249
 250         stq_u   t0, 0(a0)       # e0    : the null was in the high-order bits
 251         addq    a0, 8, a0       # .. e1 : advance dest pointer one word
 252         subq    a2, 1, a2       # e1    : one fewer full word left in COUNT
 253
 254 $u_late_head_exit:
 255         extql   t2, a1, t0      # .. e0 : remaining bytes of the current src word, shifted down
 256         cmpbge  zero, t0, t7    # e0    : flag null bytes ($u_final uses the lowest flagged byte)
 257         or      t7, t10, t6     # e1    : t6 = null mask plus end-of-count bit
 258         cmoveq  a2, t6, t7      # e0    : use t6 only when no full words remain (a2 == 0)
 259         nop                     # .. e1 :
 260
 261         /* Take care of a final (probably partial) result word.
 262            On entry to this basic block:
 263            t0 == assembled source word
 264            t7 == cmpbge mask that found the null.  */
 265 $u_final:
 266         negq    t7, t6          # e0    : isolate low bit set
 267         and     t6, t7, t8      # e1    : t8 = single bit marking the last byte to write
 268
 269         and     t8, 0x80, t6    # e0    : avoid dest word load if we can
 270         bne     t6, 1f          # .. e1 (zdb) : last byte is byte 7: store the whole word
 271
 272         ldq_u   t1, 0(a0)       # e0    : load the original destination word
 273         subq    t8, 1, t6       # .. e1 : t6 = all bits below the last-byte bit
 274         or      t6, t8, t7      # e0    : t7 = byte mask of bytes up to and including the last byte
 275         zapnot  t0, t7, t0      # .. e1 : kill source bytes > null
 276         zap     t1, t7, t1      # e0    : kill dest bytes <= null
 277         or      t0, t1, t0      # e1    : merge kept src bytes with preserved dst bytes
 278
 279 1:      stq_u   t0, 0(a0)       # e0    : store the final word
 280         ret     (t9)            # .. e1 : return through t9
 281
 282 $u_eoc:                         # end-of-count; NOTE(review): t2 was loaded before the a2 test -- confirm that read is always safe
 283         extqh   t2, a1, t0      # extract high bits for the final word
 284         or      t0, t1, t0      # final output word now complete
 285         cmpbge  zero, t0, t7    # flag any null bytes in it
 286
 287 $u_eocfin:                      # end-of-count, final word
 288         or      t10, t7, t7     # t7 |= end-of-count bit (t10)
 289         br      $u_final        # finish with the common final-word store
 290
 291         /* Unaligned copy entry point.  */
 292         .align 3
 293 $unaligned:
 294
 295         ldq_u   t1, 0(a1)       # e0    : load first source word
 296
 297         and     a0, 7, t4       # .. e1 : find dest misalignment
 298         and     a1, 7, t5       # e0    : find src misalignment
 299
 300         /* Conditionally load the first destination word and a bytemask
 301            with 0xff indicating that the destination byte is sacrosanct.  */
 302
 303         mov     zero, t0        # .. e1 : default: no dest word loaded
 304         mov     zero, t6        # e0    : default: no dest bytes to protect
 305         beq     t4, 1f          # .. e1 : dest is aligned: keep the defaults
 306         ldq_u   t0, 0(a0)       # e0    : load first dest word
 307         lda     t6, -1          # .. e1 : all ones, masked next
 308         mskql   t6, a0, t6      # e0    : t6 = 0xff in bytes below the dest offset
 309 1:
 310         subq    a1, t4, a1      # .. e1 : sub dest misalignment from src addr
 311
 312         /* If source misalignment is larger than dest misalignment, we need
 313            extra startup checks to avoid SEGV.  */
 314
 315         cmplt   t4, t5, t8      # e1    : t8 = (dest misalignment < src misalignment)
 316         extql   t1, a1, t1      # .. e0 : shift src into place
 317         lda     t2, -1          # e0    : for creating masks later
 318         beq     t8, $u_head     # e1    : src misalignment not larger: go straight to the head
 319
 320         mskqh   t2, t5, t2      # e0    : begin src byte validity mask
 321         cmpbge  zero, t1, t7    # .. e1 : is there a zero?
 322         extql   t2, a1, t2      # e0    : shift the validity mask the same way as the data
 323         or      t7, t10, t5     # .. e1 : test for end-of-count too
 324         cmpbge  zero, t2, t3    # e0    : t3 = bytes where the validity mask is zero
 325         cmoveq  a2, t5, t7      # .. e1 : include end-of-count only when a2 == 0
 326         andnot  t7, t3, t7      # e0    : ignore hits in bytes that are not valid source data
 327         beq     t7, $u_head     # .. e1 (zdb) : nothing found: continue with the head
 328
 329         /* At this point we've found a zero in the first partial word of
 330            the source.  We need to isolate the valid source data and mask
 331            it into the original destination data.  (Incidentally, we know
 332            that we'll need at least one byte of that original dest word.) */
 333
 334         ldq_u   t0, 0(a0)       # e0    : load the original destination word
 335         negq    t7, t6          # .. e1 : build bitmask of bytes <= zero
 336         mskqh   t1, t4, t1      # e0    : clear src bytes below the dest offset
 337         and     t6, t7, t8      # .. e1 : t8 = single bit marking the last byte to write
 338         subq    t8, 1, t6       # e0    : t6 = all bits below the last-byte bit
 339         or      t6, t8, t7      # e1    : t7 = byte mask of bytes up to and including the last byte
 340
 341         zapnot  t2, t7, t2      # e0    : prepare source word; mirror changes
 342         zapnot  t1, t7, t1      # .. e1 : to source validity mask
 343
 344         andnot  t0, t2, t0      # e0    : zero place for source to reside
 345         or      t0, t1, t0      # e1    : and put it there
 346         stq_u   t0, 0(a0)       # e0    : store the only output word
 347         ret     (t9)            # .. e1 : return through t9
 348
 349         .end __stxncpy