2 * Contributed by Richard Henderson (rth@tamu.edu)
4 * Copy a null-terminated string from SRC to DST.
6 * This is an internal routine used by strcpy, stpcpy, and strcat.
7 * As such, it uses special linkage conventions to make implementation
8 * of these public functions more efficient.
16 * t12 = bitmask (with one bit set) indicating the last byte written
17 * a0 = unaligned address of the last *word* written
19 * Furthermore, v0, a3-a5, and t11 are untouched; t12 is overwritten with the result bitmask described above.
22 #include <alpha/regdef.h>
29 /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
30 doesn't like putting the entry point for a procedure somewhere in the
31 middle of the procedure descriptor. Work around this by putting the
32 aligned copy in its own procedure descriptor */
/* Co-aligned copy path (reached through the stxcpy_aligned branch target
   used by the entry code).  It prepares the first, masked output word,
   then copies whole quadwords until a source word containing a null byte
   is seen, and finishes with a possibly partial store.
   NOTE(review): the stxcpy_aligned label, the $a_loop and $a_eos labels
   and the return instruction are not visible in this excerpt; the label
   positions mentioned below are inferred from the branch targets and the
   existing basic-block comments -- confirm against the full file. */
40 /* On entry to this basic block:
41 t0 == the first destination word for masking back in
42 t1 == the first source word. */
44 /* Create the 1st output word and detect 0's in the 1st input word. */
45 lda t2, -1 # e1 : build a mask against false zero
46 mskqh t2, a1, t2 # e0 : detection in the src word
47 mskqh t1, a1, t3 # e0 : t3 = src bytes at or above the start offset
48 ornot t1, t2, t2 # .. e1 : force bytes below the start offset non-zero
49 mskql t0, a1, t0 # e0 : assemble the first output word
50 cmpbge zero, t2, t8 # .. e1 : bits set iff null found
/* NOTE(review): nothing in the visible lines merges t3 and the masked t0
   into t1, yet t1 is what gets stored as the first output word below.
   An instruction combining them appears to be missing from this excerpt. */
52 bne t8, $a_eos # .. e1 :
54 /* On entry to this basic block:
55 t0 == the first destination word for masking back in
56 t1 == a source word not containing a null. */
/* Main loop (presumably labelled $a_loop, the target of the beq below):
   store the current word, advance both pointers by one quadword, load the
   next source word and test it for a null byte.  stq_u and ldq_u ignore
   the low three address bits, so the accesses are to aligned quadwords. */
59 stq_u t1, 0(a0) # e0 :
60 addq a0, 8, a0 # .. e1 :
61 ldq_u t1, 0(a1) # e0 :
62 addq a1, 8, a1 # .. e1 :
63 cmpbge zero, t1, t8 # e0 (stall)
64 beq t8, $a_loop # .. e1 (zdb)
66 /* Take care of the final (partial) word store.
67 On entry to this basic block we have:
68 t1 == the source word containing the null
69 t8 == the cmpbge mask that found it. */
/* t8 & -t8 keeps only the lowest set bit, i.e. the first null byte.
   The result is left in t12, the bitmask the callers receive. */
71 negq t8, t6 # e0 : find low bit set
72 and t8, t6, t12 # e1 (stall)
74 /* For the sake of the cache, don't read a destination word
75 if we're not going to need it. */
/* Bit 7 set means the null is the last byte of the quadword, so the whole
   word is overwritten and the old destination contents are irrelevant. */
76 and t12, 0x80, t6 # e0 :
77 bne t6, 1f # .. e1 (zdb)
79 /* We're doing a partial word store and so need to combine
80 our source and original destination words. */
81 ldq_u t0, 0(a0) # e0 :
82 subq t12, 1, t6 # .. e1 : t6 = byte mask of the bytes before the null
83 zapnot t1, t6, t1 # e0 : clear src bytes >= null
84 or t12, t6, t8 # .. e1 : t8 = byte mask of the bytes up to and including the null
85 zap t0, t8, t0 # e0 : clear dst bytes <= null
/* NOTE(review): the preserved destination bytes in t0 are not merged into
   t1 in the visible lines before the store below; an `or` appears to be
   missing from this excerpt, as does the return after the store. */
88 1: stq_u t1, 0(a0) # e0 :
/* Entry checks: choose between the co-aligned and the unaligned copy.
   a0 is the destination address and a1 the source address.
   NOTE(review): the entry label of this routine is not visible in this
   excerpt. */
100 /* Are source and destination co-aligned? */
101 xor a0, a1, t0 # e0 : t0 = bits in which the two addresses differ
/* NOTE(review): as shown, the branch below is taken whenever the two
   addresses differ in any bit, not only in the low three.  A step that
   isolates the in-word offset bits appears to be missing from this
   excerpt -- confirm against the full file. */
104 bne t0, $unaligned # .. e1 :
106 /* We are co-aligned; take care of a partial first word. */
/* Sets up the stxcpy_aligned entry contract: t1 = first source word,
   t0 = first destination word (left as 0 when the destination is
   quadword aligned and nothing needs to be masked back in), and a1
   already advanced past the first source word. */
107 ldq_u t1, 0(a1) # e0 : load first src word
108 and a0, 7, t0 # .. e1 : take care not to load a word ...
109 addq a1, 8, a1 # e0 :
110 beq t0, stxcpy_aligned # .. e1 : ... if we will not need it
111 ldq_u t0, 0(a0) # e0 :
112 br stxcpy_aligned # .. e1 :
115 /* The source and destination are not co-aligned. Align the destination
116 and cope. We have to be very careful about not reading too much and causing a SEGV. */
121 /* We know just enough now to be able to assemble the first
122 full source word. We can still find a zero at the end of it
123 that prevents us from outputting the whole thing.
125 On entry to this basic block:
126 t0 == the first dest word, for masking back in, if needed else 0
127 t1 == the low bits of the first source word
128 t6 == bytemask that is -1 in dest word bytes */
/* Head of the unaligned copy (presumably labelled $u_head, the target of
   the branches in the unaligned entry code).  The low three bits of a1
   hold the source offset relative to the destination (set up by the
   entry code), and extql/extqh use them as the byte shift count.
   NOTE(review): the label itself is not visible in this excerpt. */
130 ldq_u t2, 8(a1) # e0 : load the second source quadword
131 addq a1, 8, a1 # .. e1 :
133 extql t1, a1, t1 # e0 : shift first-word bytes down to their dest position
134 extqh t2, a1, t4 # e0 : shift second-word bytes up to fill the rest
135 mskql t0, a0, t0 # e0 : keep dest bytes below the dest offset
136 or t1, t4, t1 # .. e1 : t1 = assembled source word
137 mskqh t1, a0, t1 # e0 : drop source bytes below the dest offset
/* NOTE(review): in the visible lines the preserved destination bytes (t0)
   are never merged into t1, and t6 still holds only the entry bytemask
   when it is tested for a null below.  Instructions folding t0 and t1
   together appear to be missing from this excerpt -- confirm against the
   full file. */
141 cmpbge zero, t6, t8 # .. e1 :
142 lda t6, -1 # e0 : for masking just below
143 bne t8, $u_final # .. e1 :
/* No null in the first output word.  Before storing it, check the bytes of
   t2 that have not been consumed yet: the bytes already extracted are
   forced to 0xff so that only the remaining ones can signal end of string. */
145 mskql t6, a1, t6 # e0 : mask out the bits we have
146 or t6, t2, t2 # e1 : already extracted before
147 cmpbge zero, t2, t8 # e0 : testing eos
148 bne t8, $u_late_head_exit # .. e1 (zdb)
150 /* Finally, we've got all the stupid leading edge cases taken care
151 of and we can set up to enter the main loop. */
/* Establishes the main-loop contract: t0 = shifted high-order bytes of the
   word just consumed, t2 = next unshifted source word, already tested for
   a null byte. */
153 stq_u t1, 0(a0) # e0 : store first output word
154 addq a0, 8, a0 # .. e1 :
155 extql t2, a1, t0 # e0 : position ho-bits of lo word
156 ldq_u t2, 8(a1) # .. e1 : read next high-order source word
157 addq a1, 8, a1 # e0 :
158 cmpbge zero, t2, t8 # .. e1 :
160 bne t8, $u_eos # .. e1 :
162 /* Unaligned copy main loop. In order to avoid reading too much,
163 the loop is structured to detect zeros in aligned source words.
164 This has, unfortunately, effectively pulled half of a loop
165 iteration out into the head and half into the tail, but it does
166 prevent nastiness from accumulating in the very thing we want
167 to run as fast as possible.
169 On entry to this basic block:
170 t0 == the shifted high-order bits from the previous source word
171 t2 == the unshifted current source word
173 We further know that t2 does not contain a null terminator. */
/* Each iteration builds one destination quadword from the tail of the
   previous source word (t0) and the head of the current one (t2), stores
   it, and loads the next source word, which is tested for a null byte
   before it is used.
   NOTE(review): the $u_loop label targeted by the beq below is not visible
   in this excerpt. */
177 extqh t2, a1, t1 # e0 : extract high bits for current word
178 addq a1, 8, a1 # .. e1 :
179 extql t2, a1, t3 # e0 : extract low bits for next time
180 addq a0, 8, a0 # .. e1 :
181 or t0, t1, t1 # e0 : current dst word now complete
182 ldq_u t2, 0(a1) # .. e1 : load high word for next time
183 stq_u t1, -8(a0) # e0 : save the current word
/* NOTE(review): t3 (the low bits kept for the next iteration) is never
   copied into t0 in the visible lines, although the loop-entry contract
   above expects them in t0.  An instruction appears to be missing from
   this excerpt -- confirm against the full file. */
185 cmpbge zero, t2, t8 # e0 : test new word for eos
186 beq t8, $u_loop # .. e1 :
/* End-of-string handling for the unaligned copy.
   NOTE(review): the labels $u_eos, $u_late_head_exit and $u_final that
   other blocks branch to are not visible in this excerpt.  Judging by the
   existing comments, $u_eos starts at the first extqh below,
   $u_late_head_exit at the full-word store, and $u_final at the negq --
   confirm against the full file. */
188 /* We've found a zero somewhere in the source word we just read.
189 If it resides in the lower half, we have one (probably partial)
190 word to write out, and if it resides in the upper half, we
191 have one full and one partial word left to write out.
193 On entry to this basic block:
194 t0 == the shifted high-order bits from the previous source word
195 t2 == the unshifted current source word. */
197 extqh t2, a1, t1 # e0 :
198 or t0, t1, t1 # e1 : first (partial) source word complete
200 cmpbge zero, t1, t8 # e0 : is the null in this first bit?
201 bne t8, $u_final # .. e1 (zdb)
/* The assembled word holds no null: store it whole, then build the last
   word from the remaining bytes of t2 and recompute the null mask. */
204 stq_u t1, 0(a0) # e0 : the null was in the high-order bits
205 addq a0, 8, a0 # .. e1 :
206 extql t2, a1, t1 # e0 :
207 cmpbge zero, t1, t8 # .. e1 :
209 /* Take care of a final (probably partial) result word.
210 On entry to this basic block:
211 t1 == assembled source word
212 t8 == cmpbge mask that found the null. */
/* Same tail sequence as in the co-aligned path: t12 receives the single
   bit marking the first null byte, which is the value handed back to the
   callers. */
214 negq t8, t6 # e0 : isolate low bit set
215 and t6, t8, t12 # e1 :
/* Bit 7 set means the null is the last byte of the quadword, so the whole
   word is overwritten and the destination need not be read. */
217 and t12, 0x80, t6 # e0 : avoid dest word load if we can
218 bne t6, 1f # .. e1 (zdb)
220 ldq_u t0, 0(a0) # e0 :
221 subq t12, 1, t6 # .. e1 : t6 = byte mask of the bytes before the null
222 or t6, t12, t8 # e0 : t8 = byte mask of the bytes up to and including the null
223 zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
224 zap t0, t8, t0 # e0 : kill dest bytes <= null
/* NOTE(review): the preserved destination bytes in t0 are not merged into
   t1 in the visible lines before the store below; an `or` appears to be
   missing from this excerpt, as does the return after the store. */
227 1: stq_u t1, 0(a0) # e0 :
230 /* Unaligned copy entry point. */
/* Reached from the co-alignment check when source and destination offsets
   differ.  Prepares t0, t1 and t6 for the head of the unaligned copy and
   handles a string that ends inside the first, partial source word.
   NOTE(review): the $unaligned label itself is not visible in this
   excerpt. */
234 ldq_u t1, 0(a1) # e0 : load first source word
236 and a0, 7, t4 # .. e1 : find dest misalignment
237 and a1, 7, t5 # e0 : find src misalignment
239 /* Conditionally load the first destination word and a bytemask
240 with 0xff indicating that the destination byte is sacrosanct. */
/* NOTE(review): the comment above describes a conditional load, but no
   branch around the ldq_u/mskql pair and no initialisation of t6 before
   the mskql is visible in this excerpt.  Several lines appear to be
   missing here -- confirm against the full file. */
242 mov zero, t0 # .. e1 :
245 ldq_u t0, 0(a0) # e0 :
247 mskql t6, a0, t6 # e0 : keep mask bytes below the dest offset
/* After this subtraction the low three bits of a1 hold the source offset
   relative to the destination, which extql/extqh later use as the byte
   shift count. */
249 subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
251 /* If source misalignment is larger than dest misalignment, we need
252 extra startup checks to avoid SEGV. */
254 cmplt t4, t5, t12 # e0 : t12 = 1 iff dest offset < src offset
255 beq t12, $u_head # .. e1 (zdb)
/* Look for a null among the valid bytes of the first source word only:
   the bytes below the source offset are forced to 0xff so that they
   cannot be mistaken for the terminator. */
257 lda t2, -1 # e1 : mask out leading garbage in source
258 mskqh t2, t5, t2 # e0 : t2 = 0xff in every valid source byte
260 ornot t1, t2, t3 # .. e1 :
261 cmpbge zero, t3, t8 # e0 : is there a zero?
262 beq t8, $u_head # .. e1 (zdb)
264 /* At this point we've found a zero in the first partial word of
265 the source. We need to isolate the valid source data and mask
266 it into the original destination data. (Incidentally, we know
267 that we'll need at least one byte of that original dest word.) */
269 ldq_u t0, 0(a0) # e0 :
271 negq t8, t6 # .. e1 : build bitmask of bytes <= zero
272 and t6, t8, t12 # e0 : t12 = single bit marking the first null byte
273 and a1, 7, t5 # .. e1 : t5 = source offset relative to dest
274 subq t12, 1, t6 # e0 :
275 or t6, t12, t8 # e1 : t8 = byte mask of the bytes up to and including the null
276 srl t12, t5, t12 # e0 : adjust final null return value
/* t2 becomes the mask of source bytes that are both valid and at or
   before the null.  It and the data are then shifted down by the relative
   offset so that they line up with the destination byte positions. */
278 zapnot t2, t8, t2 # .. e1 : prepare source word; mirror changes
279 and t1, t2, t1 # e1 : to source validity mask
280 extql t2, a1, t2 # .. e0 :
281 extql t1, a1, t1 # e0 :
283 andnot t0, t2, t0 # .. e1 : zero place for source to reside
284 or t0, t1, t1 # e1 : and put it there
285 stq_u t1, 0(a0) # .. e0 :
/* NOTE(review): no return instruction is visible after this final store;
   it presumably follows outside this excerpt. */