2 * arch/alpha/lib/ev6-stxcpy.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 * Copy a null-terminated string from SRC to DST.
7 * This is an internal routine used by strcpy, stpcpy, and strcat.
8 * As such, it uses special linkage conventions to make implementation
9 * of these public functions more efficient.
17 * t12 = bitmask (with one bit set) indicating the last byte written
18 * a0 = unaligned address of the last *word* written
20 * Furthermore, v0, a3-a5, and t11 are untouched.
22 * Much of the information about 21264 scheduling/coding comes from:
23 * Compiler Writer's Guide for the Alpha 21264
24 * abbreviated as 'CWG' in other comments here
25 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
26 * Scheduling notation:
28 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
29 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
30 * Try not to change the actual algorithm if possible for consistency.
33 #include <asm/regdef.h>
48 /* On entry to this basic block:
49 t0 == the first destination word for masking back in
50 t1 == the first source word. */
52 /* Create the 1st output word and detect 0's in the 1st input word. */
/* Review notes on the sequence below.  The mask instructions use a1 only
   as a byte offset within the quadword.
   - t2 starts as all ones; mskqh clears its bytes below that offset.
     ornot then turns the matching low bytes of the source copy into 0xff,
     so cmpbge cannot report a zero in bytes that precede the string.
   - t3 keeps the source bytes at and above the offset, mskql keeps the
     destination bytes below it, and their OR (t1) is the first word to
     be written.
   - Each set bit in t8 marks one zero byte; any set bit sends control
     to the end-of-string code at $a_eos.  */
53 lda t2, -1 # E : build a mask against false zero
54 mskqh t2, a1, t2 # U : detection in the src word (stall)
55 mskqh t1, a1, t3 # U :
56 ornot t1, t2, t2 # E : (stall)
58 mskql t0, a1, t0 # U : assemble the first output word
59 cmpbge zero, t2, t8 # E : bits set iff null found
60 or t0, t3, t1 # E : (stall)
61 bne t8, $a_eos # U : (stall)
63 /* On entry to this basic block:
64 t0 == the first destination word for masking back in
65 t1 == a source word not containing a null. */
66 /* Nops here to separate store quads from load quads */
/* Review notes: the visible body loads the quadword containing a1 into t1,
   asks cmpbge for a per-byte "is zero" mask in t8, and branches back to
   $a_loop while that mask is empty.
   NOTE(review): no store of t1, no advance of a0 or a1, no nops and no
   $a_loop label are visible in this view, although the comments above
   mention stores and nops.  Confirm those lines exist in the full file.  */
74 ldq_u t1, 0(a1) # L : Latency=3
76 cmpbge zero, t1, t8 # E : (3 cycle stall)
77 beq t8, $a_loop # U : (stall for t8)
79 /* Take care of the final (partial) word store.
80 On entry to this basic block we have:
81 t1 == the source word containing the null
82 t8 == the cmpbge mask that found it. */
/* Review notes: negq followed by and isolates the lowest set bit of t8, so
   t12 ends up with exactly one bit set, at the index of the first null
   byte.  This is the single-bit mask the file header lists as the t12
   output.  */
84 negq t8, t6 # E : find low bit set
85 and t8, t6, t12 # E : (stall)
86 /* For the sake of the cache, don't read a destination word
87 if we're not going to need it. */
/* Bit 0x80 set means the null is the last byte of the word, so the whole
   source word is stored as is and the merge below is skipped.  */
88 and t12, 0x80, t6 # E : (stall)
89 bne t6, 1f # U : (stall)
91 /* We're doing a partial word store and so need to combine
92 our source and original destination words. */
/* Intended merge, per the inline comments: keep source bytes below the
   null, keep destination bytes above the null, and leave the null byte
   itself zero.
   NOTE(review): on this fall-through path t6 still holds (t12 & 0x80),
   which is zero because the bne above was not taken.  As written, zapnot
   would therefore clear every byte of t1 and t8 would equal t12.  The
   equivalent merge in the later final-word block executes
   `subq t12, 1, t6` before its zapnot; confirm that instruction is
   present here in the full file.  */
93 ldq_u t0, 0(a0) # L : Latency=3
95 zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
96 or t12, t6, t8 # E : (stall)
98 zap t0, t8, t0 # E : clear dst bytes <= null
99 or t0, t1, t1 # E : (stall)
/* Write the final word and return through t9.  */
103 1: stq_u t1, 0(a0) # L :
104 ret (t9) # L0 : Latency=3
117 /* Are source and destination co-aligned? */
/* Review notes: the test keeps only the low three bits of t0 and goes to
   $unaligned when they are non-zero.
   NOTE(review): the instruction that produces t0 from a0 and a1 is not
   visible in this view; presumably t0 holds a value whose low bits differ
   exactly when the two pointers have different alignment.  Confirm.  */
120 and t0, 7, t0 # E : (stall)
121 bne t0, $unaligned # U : (stall)
123 /* We are co-aligned; take care of a partial first word. */
/* t1 receives the quadword containing the start of the source.  When a0
   is already 8-byte aligned, t0 is zero at the branch and the destination
   word is never read; otherwise t0 is loaded with the first destination
   word so its leading bytes can be preserved by stxcpy_aligned.  */
124 ldq_u t1, 0(a1) # L : load first src word
125 and a0, 7, t0 # E : take care not to load a word ...
127 beq t0, stxcpy_aligned # U : ... if we wont need it (stall)
129 ldq_u t0, 0(a0) # L :
130 br stxcpy_aligned # L0 : Latency=3
135 /* The source and destination are not co-aligned. Align the destination
136 and cope. We have to be very careful about not reading too much and
141 /* We know just enough now to be able to assemble the first
142 full source word. We can still find a zero at the end of it
143 that prevents us from outputting the whole thing.
145 On entry to this basic block:
146 t0 == the first dest word, for masking back in, if needed else 0
147 t1 == the low bits of the first source word
148 t6 == bytemask that is -1 in dest word bytes */
/* Review notes:
   - t2 is the next source quadword.  extql shifts t1 right and extqh
     shifts t2 left by complementary byte counts taken from the low bits
     of a1, which lines the two halves up with the destination word.
   - mskql keeps the destination bytes below the a0 offset, mskqh keeps the
     source bytes at and above it, and their OR is the first output word.
   NOTE(review): in this view t4 (the extqh result) is never consumed, and
   the cmpbge that tests for a terminator reads t6 without t1 having been
   merged into it.  Confirm the corresponding OR instructions and the a1
   advance exist in the full file.  */
150 ldq_u t2, 8(a1) # L :
152 extql t1, a1, t1 # U : (stall on a1)
153 extqh t2, a1, t4 # U : (stall on a1)
155 mskql t0, a0, t0 # U :
157 mskqh t1, a0, t1 # U : (stall on t1)
158 or t0, t1, t1 # E : (stall on t1)
161 cmpbge zero, t6, t8 # E : (stall)
162 lda t6, -1 # E : for masking just below
163 bne t8, $u_final # U : (stall)
/* t6 becomes 0xff in the bytes below the a1 offset.  OR-ing it into t2
   keeps those already used bytes from being reported as zero, so cmpbge
   only sees a terminator in the part of t2 not yet copied.  */
165 mskql t6, a1, t6 # U : mask out the bits we have
166 or t6, t2, t2 # E : already extracted before (stall)
167 cmpbge zero, t2, t8 # E : testing eos (stall)
168 bne t8, $u_late_head_exit # U : (stall)
170 /* Finally, we've got all the stupid leading edge cases taken care
171 of and we can set up to enter the main loop. */
/* Store the completed first word, keep the unused high-order part of the
   current source word in t0, fetch the next source word into t2 and test
   it for a terminator before entering the loop.  */
173 stq_u t1, 0(a0) # L : store first output word
175 extql t2, a1, t0 # U : position ho-bits of lo word
176 ldq_u t2, 8(a1) # U : read next high-order source word
179 cmpbge zero, t2, t8 # E : (stall for t2)
181 bne t8, $u_eos # U : (stall)
183 /* Unaligned copy main loop. In order to avoid reading too much,
184 the loop is structured to detect zeros in aligned source words.
185 This has, unfortunately, effectively pulled half of a loop
186 iteration out into the head and half into the tail, but it does
187 prevent nastiness from accumulating in the very thing we want
188 to run as fast as possible.
190 On entry to this basic block:
191 t0 == the shifted high-order bits from the previous source word
192 t2 == the unshifted current source word
194 We further know that t2 does not contain a null terminator. */
/* Review notes: each pass combines the carried-over bytes in t0 with the
   low part of the current word (extqh) to form one destination word,
   saves the remaining bytes of the current word (extql into t3), loads
   the next aligned source word and repeats while cmpbge finds no zero
   byte in it.
   NOTE(review): in this view t3 is never copied to t0 for the next pass,
   a0 is not advanced although the store addresses -8(a0), and the $u_loop
   label is absent.  Confirm these lines exist in the full file.  */
198 extqh t2, a1, t1 # U : extract high bits for current word
199 addq a1, 8, a1 # E : (stall)
200 extql t2, a1, t3 # U : extract low bits for next time (stall)
203 or t0, t1, t1 # E : current dst word now complete
204 ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
205 stq_u t1, -8(a0) # L : save the current word (stall)
208 cmpbge zero, t2, t8 # E : test new word for eos
209 beq t8, $u_loop # U : (stall)
213 /* We've found a zero somewhere in the source word we just read.
214 If it resides in the lower half, we have one (probably partial)
215 word to write out, and if it resides in the upper half, we
216 have one full and one partial word left to write out.
218 On entry to this basic block:
219 t0 == the shifted high-order bits from the previous source word
220 t2 == the unshifted current source word. */
/* Review notes: the destination word is assembled exactly as in the main
   loop and then tested.  A zero inside it means this is the last word, so
   control goes to $u_final with t1 and t8 already set up.  */
222 extqh t2, a1, t1 # U :
223 or t0, t1, t1 # E : first (partial) source word complete (stall)
224 cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
225 bne t8, $u_final # U : (stall)
/* Otherwise the assembled word is complete: store it, then put the
   remaining bytes of t2 into t1 and compute their zero mask.  As laid out
   here execution continues straight into the final-word code that
   follows.
   NOTE(review): presumably the $u_late_head_exit target used by the head
   code is the store below; the label is not visible here, and neither is
   an advance of a0 between the two stores.  Confirm in the full file.  */
228 stq_u t1, 0(a0) # L : the null was in the high-order bits
230 extql t2, a1, t1 # U :
231 cmpbge zero, t1, t8 # E : (stall)
233 /* Take care of a final (probably partial) result word.
234 On entry to this basic block:
235 t1 == assembled source word
236 t8 == cmpbge mask that found the null. */
/* Review notes:
   - negq plus and leaves in t12 only the lowest set bit of t8, i.e. the
     index of the first null byte.  This is the single-bit mask the file
     header lists as the t12 output.
   - If that bit is 0x80 the null is the last byte of the word, so the word
     is stored whole and the destination is never read.
   - Otherwise t6 = t12 - 1 selects the bytes below the null and
     t8 = t6 | t12 selects the bytes up to and including it.  zapnot keeps
     only the source bytes below the null, zap clears the destination bytes
     up to and including the null, and the OR merges them, leaving a zero
     byte at the null position and the original destination bytes above
     it.  */
238 negq t8, t6 # E : isolate low bit set
239 and t6, t8, t12 # E : (stall)
240 and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
241 bne t6, 1f # U : (stall)
243 ldq_u t0, 0(a0) # E :
244 subq t12, 1, t6 # E :
245 or t6, t12, t8 # E : (stall)
246 zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
248 zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
249 or t0, t1, t1 # E : (stall)
/* Write the final word and return through t9.  */
253 1: stq_u t1, 0(a0) # L :
254 ret (t9) # L0 : Latency=3
258 /* Unaligned copy entry point. */
/* Review notes: t1 is the quadword containing the start of the source,
   t4 is the byte offset of a0 within its quadword and t5 is the byte
   offset of a1 within its quadword.  */
262 ldq_u t1, 0(a1) # L : load first source word
263 and a0, 7, t4 # E : find dest misalignment
264 and a1, 7, t5 # E : find src misalignment
265 /* Conditionally load the first destination word and a bytemask
266 with 0xff indicating that the destination byte is sacrosanct. */
/* NOTE(review): the comment above describes a conditional load, but no
   branch, no zeroing of t0 and t6 and no `lda t6, -1` are visible in this
   view, so t6 reaches mskql without a visible definition.  Confirm those
   lines exist in the full file.  mskql keeps the bytes of t6 below the a0
   offset, i.e. the destination bytes that precede the string.  */
271 ldq_u t0, 0(a0) # L :
274 mskql t6, a0, t6 # U :
279 subq a1, t4, a1 # E : sub dest misalignment from src addr
280 /* If source misalignment is larger than dest misalignment, we need
281 extra startup checks to avoid SEGV. */
/* cmplt sets t12 to 1 only when t4 < t5.  When it is 0 the generic head
   code at $u_head is used directly.  Otherwise t2 becomes all ones with
   the bytes below the source offset cleared, ornot forces those leading
   bytes of the source copy to 0xff, and cmpbge looks for a terminator in
   the valid bytes only.  With no terminator found, control also goes to
   $u_head.  */
282 cmplt t4, t5, t12 # E :
283 beq t12, $u_head # U :
284 lda t2, -1 # E : mask out leading garbage in source
286 mskqh t2, t5, t2 # U :
287 ornot t1, t2, t3 # E : (stall)
288 cmpbge zero, t3, t8 # E : is there a zero? (stall)
289 beq t8, $u_head # U : (stall)
291 /* At this point we've found a zero in the first partial word of
292 the source. We need to isolate the valid source data and mask
293 it into the original destination data. (Incidentally, we know
294 that we'll need at least one byte of that original dest word.) */
/* Review notes:
   - t12 is reduced to the lowest set bit of t8 (the first null byte),
     t6 = t12 - 1 selects the bytes below it, and t8 = t6 | t12 selects the
     bytes up to and including it.
   - t2 enters as the validity mask built by the preceding code (0xff in
     source bytes at and above the source offset).  zapnot limits it to
     the bytes up to and including the null, and the AND leaves only the
     valid string bytes in t1.
   - Both the data (t1) and its mask (t2) are shifted right by the byte
     offset in a1 so they line up with the destination word.  andnot
     clears the destination bytes that the string will occupy and the OR
     inserts the string bytes before the store.
   - srl moves the single-bit mask in t12 down by the count held in t5.
   NOTE(review): no return instruction follows the store within this view,
   and the numbering gap before the subq suggests a line is not shown.
   Confirm against the full file.  */
296 ldq_u t0, 0(a0) # L :
297 negq t8, t6 # E : build bitmask of bytes <= zero
298 and t6, t8, t12 # E : (stall)
301 subq t12, 1, t6 # E :
302 or t6, t12, t8 # E : (stall)
303 srl t12, t5, t12 # U : adjust final null return value
304 zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
306 and t1, t2, t1 # E : to source validity mask
307 extql t2, a1, t2 # U :
308 extql t1, a1, t1 # U : (stall)
309 andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
311 or t0, t1, t1 # e1 : and put it there
312 stq_u t1, 0(a0) # .. e0 : (stall)