2 * arch/alpha/lib/ev6-memset.S
4 * This is an efficient (and relatively small) implementation of the C library
5 * "memset()" function for the 21264 implementation of Alpha.
7 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
9 * Much of the information about 21264 scheduling/coding comes from:
10 * Compiler Writer's Guide for the Alpha 21264
11 * abbreviated as 'CWG' in other comments here
12 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
13 * Scheduling notation:
15 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
16 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
17 * The algorithm for the leading and trailing quadwords remains the same,
18 * however the loop has been unrolled to enable better memory throughput,
19 * and the code has been replicated for each of the entry points: __memset
20 * and __memsetw to permit better scheduling to eliminate the stalling
21 * encountered during the mask replication.
22 * A future enhancement might be to put in a byte store loop for really
23 * small (say < 32 bytes) memset()s. Whether or not that change would be
24 * a win in the kernel would depend upon the contextual usage.
25 * WARNING: Maintaining this is going to be more work than the above version,
26 * as fixes will need to be made in multiple places. The performance gain
35 .globl __constant_c_memset
45 * Serious stalling happens. The only way to mitigate this is to
46 * undertake a major re-write to interleave the constant materialization
47 * with other parts of the fall-through code. This is important, even
48 * though it makes maintenance tougher.
51 and $17,255,$1 # E : 00000000000000ch
52 insbl $17,1,$2 # U : 000000000000ch00
53 bis $16,$16,$0 # E : return value
54 ble $18,end_b # U : zero length requested?
56 addq $18,$16,$6 # E : max address to write to
57 bis $1,$2,$17 # E : 000000000000chch
58 insbl $1,2,$3 # U : 0000000000ch0000
59 insbl $1,3,$4 # U : 00000000ch000000
61 or $3,$4,$3 # E : 00000000chch0000
62 inswl $17,4,$5 # U : 0000chch00000000
63 xor $16,$6,$1 # E : will complete write be within one quadword?
64 inswl $17,6,$2 # U : chch000000000000
66 or $17,$3,$17 # E : 00000000chchchch
67 or $2,$5,$2 # E : chchchch00000000
68 bic $1,7,$1 # E : fit within a single quadword?
69 and $16,7,$3 # E : Target addr misalignment
71 or $17,$2,$17 # E : chchchchchchchch
72 beq $1,within_quad_b # U :
74 beq $3,aligned_b # U : target is 0mod8
77 * Target address is misaligned, and won't fit within a quadword
79 ldq_u $4,0($16) # L : Fetch first partial
80 bis $16,$16,$5 # E : Save the address
81 insql $17,$16,$2 # U : Insert new bytes
82 subq $3,8,$3 # E : Invert (for addressing uses)
84 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
85 mskql $4,$16,$4 # U : clear relevant parts of the quad
86 subq $16,$3,$16 # E : $16 is new aligned destination
87 bis $2,$4,$1 # E : Final bytes
90 stq_u $1,0($5) # L : Store result
97 * We are now guaranteed to be quad aligned, with at least
98 * one partial quad to write.
101 sra $18,3,$3 # U : Number of remaining quads to write
102 and $18,7,$18 # E : Number of trailing bytes to write
103 bis $16,$16,$5 # E : Save dest address
104 beq $3,no_quad_b # U : tail stuff only
107 * it's worth the effort to unroll this and use wh64 if possible
108 * Lifted a bunch of code from clear_user.S
109 * At this point, entry values are:
110 * $16 Current destination address
112 * $6 The max quadword address to write to
113 * $18 Number trailer bytes
114 * $3 Number quads to write
117 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
118 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
119 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
123 * We know we've got at least 16 quads, minimum of one trip
124 * through unrolled loop. Do a quad at a time to get us 0mod64
131 beq $1, $bigalign_b # U :
135 subq $3, 1, $3 # E : For consistency later
136 addq $1, 8, $1 # E : Increment towards zero for alignment
137 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
141 addq $5, 8, $5 # E : Inc address
142 blt $1, $alignmod64_b # U :
146 * $3 - number quads left to go
147 * $5 - target address (aligned 0mod64)
148 * $17 - mask of stuff to store
149 * Scratch registers available: $7, $2, $4, $1
150 * we know that we'll be taking a minimum of one trip through
151 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
152 * Assumes the wh64 needs to be for 2 trips through the loop in the future
153 * The wh64 is issued on for the starting destination address for trip +2
154 * through the loop, and if there are less than two trips left, the target
155 * address will be for the current trip.
159 wh64 ($4) # L1 : memory subsystem write hint
160 subq $3, 24, $2 # E : For determining future wh64 addresses
164 addq $5, 128, $4 # E : speculative target of next wh64
166 stq $17, 16($5) # L :
167 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
169 stq $17, 24($5) # L :
170 stq $17, 32($5) # L :
171 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
174 stq $17, 40($5) # L :
175 stq $17, 48($5) # L :
176 subq $3, 16, $2 # E : Repeat the loop at least once more?
179 stq $17, 56($5) # L :
180 addq $5, 64, $5 # E :
182 bge $2, $do_wh64_b # U :
187 beq $3, no_quad_b # U : Might have finished already
191 * Simple loop for trailing quadwords, or for small amounts
192 * of data (where we can't use an unrolled loop and wh64)
196 subq $3,1,$3 # E : Decrement number quads left
197 addq $5,8,$5 # E : Inc address
198 bne $3,loop_b # U : more?
202 * Write 0..7 trailing bytes.
205 beq $18,end_b # U : All done?
207 mskqh $7,$6,$2 # U : Mask final quad
209 insqh $17,$6,$4 # U : New bits
210 bis $2,$4,$1 # E : Put it all together
211 stq $1,0($5) # L : And back to memory
212 ret $31,($26),1 # L0 :
215 ldq_u $1,0($16) # L :
216 insql $17,$16,$2 # U : New bits
217 mskql $1,$16,$4 # U : Clear old
218 bis $2,$4,$2 # E : New result
223 stq_u $1,0($16) # L :
229 ret $31,($26),1 # L0 :
233 * This is the original body of code, prior to replication and
234 * rescheduling. Leave it here, as there may be calls to this
238 .ent __constant_c_memset
243 addq $18,$16,$6 # E : max address to write to
244 bis $16,$16,$0 # E : return value
245 xor $16,$6,$1 # E : will complete write be within one quadword?
246 ble $18,end # U : zero length requested?
248 bic $1,7,$1 # E : fit within a single quadword
249 beq $1,within_one_quad # U :
250 and $16,7,$3 # E : Target addr misalignment
251 beq $3,aligned # U : target is 0mod8
254 * Target address is misaligned, and won't fit within a quadword
256 ldq_u $4,0($16) # L : Fetch first partial
257 bis $16,$16,$5 # E : Save the address
258 insql $17,$16,$2 # U : Insert new bytes
259 subq $3,8,$3 # E : Invert (for addressing uses)
261 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
262 mskql $4,$16,$4 # U : clear relevant parts of the quad
263 subq $16,$3,$16 # E : $16 is new aligned destination
264 bis $2,$4,$1 # E : Final bytes
267 stq_u $1,0($5) # L : Store result
274 * We are now guaranteed to be quad aligned, with at least
275 * one partial quad to write.
278 sra $18,3,$3 # U : Number of remaining quads to write
279 and $18,7,$18 # E : Number of trailing bytes to write
280 bis $16,$16,$5 # E : Save dest address
281 beq $3,no_quad # U : tail stuff only
284 * it's worth the effort to unroll this and use wh64 if possible
285 * Lifted a bunch of code from clear_user.S
286 * At this point, entry values are:
287 * $16 Current destination address
289 * $6 The max quadword address to write to
290 * $18 Number trailer bytes
291 * $3 Number quads to write
294 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
295 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
296 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
300 * We know we've got at least 16 quads, minimum of one trip
301 * through unrolled loop. Do a quad at a time to get us 0mod64
308 beq $1, $bigalign # U :
312 subq $3, 1, $3 # E : For consistency later
313 addq $1, 8, $1 # E : Increment towards zero for alignment
314 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
318 addq $5, 8, $5 # E : Inc address
319 blt $1, $alignmod64 # U :
323 * $3 - number quads left to go
324 * $5 - target address (aligned 0mod64)
325 * $17 - mask of stuff to store
326 * Scratch registers available: $7, $2, $4, $1
327 * we know that we'll be taking a minimum of one trip through
328 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
329 * Assumes the wh64 needs to be for 2 trips through the loop in the future
330 * The wh64 is issued on for the starting destination address for trip +2
331 * through the loop, and if there are less than two trips left, the target
332 * address will be for the current trip.
336 wh64 ($4) # L1 : memory subsystem write hint
337 subq $3, 24, $2 # E : For determining future wh64 addresses
341 addq $5, 128, $4 # E : speculative target of next wh64
343 stq $17, 16($5) # L :
344 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
346 stq $17, 24($5) # L :
347 stq $17, 32($5) # L :
348 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
351 stq $17, 40($5) # L :
352 stq $17, 48($5) # L :
353 subq $3, 16, $2 # E : Repeat the loop at least once more?
356 stq $17, 56($5) # L :
357 addq $5, 64, $5 # E :
359 bge $2, $do_wh64 # U :
364 beq $3, no_quad # U : Might have finished already
368 * Simple loop for trailing quadwords, or for small amounts
369 * of data (where we can't use an unrolled loop and wh64)
373 subq $3,1,$3 # E : Decrement number quads left
374 addq $5,8,$5 # E : Inc address
375 bne $3,loop # U : more?
379 * Write 0..7 trailing bytes.
382 beq $18,end # U : All done?
384 mskqh $7,$6,$2 # U : Mask final quad
386 insqh $17,$6,$4 # U : New bits
387 bis $2,$4,$1 # E : Put it all together
388 stq $1,0($5) # L : And back to memory
389 ret $31,($26),1 # L0 :
392 ldq_u $1,0($16) # L :
393 insql $17,$16,$2 # U : New bits
394 mskql $1,$16,$4 # U : Clear old
395 bis $2,$4,$2 # E : New result
400 stq_u $1,0($16) # L :
406 ret $31,($26),1 # L0 :
407 .end __constant_c_memset
410 * This is a replicant of the __constant_c_memset code, rescheduled
411 * to mask stalls. Note that entry point names also had to change
420 inswl $17,0,$5 # U : 000000000000c1c2
421 inswl $17,2,$2 # U : 00000000c1c20000
422 bis $16,$16,$0 # E : return value
423 addq $18,$16,$6 # E : max address to write to
425 ble $18, end_w # U : zero length requested?
426 inswl $17,4,$3 # U : 0000c1c200000000
427 inswl $17,6,$4 # U : c1c2000000000000
428 xor $16,$6,$1 # E : will complete write be within one quadword?
430 or $2,$5,$2 # E : 00000000c1c2c1c2
431 or $3,$4,$17 # E : c1c2c1c200000000
432 bic $1,7,$1 # E : fit within a single quadword
433 and $16,7,$3 # E : Target addr misalignment
435 or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
436 beq $1,within_quad_w # U :
438 beq $3,aligned_w # U : target is 0mod8
441 * Target address is misaligned, and won't fit within a quadword
443 ldq_u $4,0($16) # L : Fetch first partial
444 bis $16,$16,$5 # E : Save the address
445 insql $17,$16,$2 # U : Insert new bytes
446 subq $3,8,$3 # E : Invert (for addressing uses)
448 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
449 mskql $4,$16,$4 # U : clear relevant parts of the quad
450 subq $16,$3,$16 # E : $16 is new aligned destination
451 bis $2,$4,$1 # E : Final bytes
454 stq_u $1,0($5) # L : Store result
461 * We are now guaranteed to be quad aligned, with at least
462 * one partial quad to write.
465 sra $18,3,$3 # U : Number of remaining quads to write
466 and $18,7,$18 # E : Number of trailing bytes to write
467 bis $16,$16,$5 # E : Save dest address
468 beq $3,no_quad_w # U : tail stuff only
471 * it's worth the effort to unroll this and use wh64 if possible
472 * Lifted a bunch of code from clear_user.S
473 * At this point, entry values are:
474 * $16 Current destination address
476 * $6 The max quadword address to write to
477 * $18 Number trailer bytes
478 * $3 Number quads to write
481 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
482 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
483 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
487 * We know we've got at least 16 quads, minimum of one trip
488 * through unrolled loop. Do a quad at a time to get us 0mod64
495 beq $1, $bigalign_w # U :
499 subq $3, 1, $3 # E : For consistency later
500 addq $1, 8, $1 # E : Increment towards zero for alignment
501 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
505 addq $5, 8, $5 # E : Inc address
506 blt $1, $alignmod64_w # U :
510 * $3 - number quads left to go
511 * $5 - target address (aligned 0mod64)
512 * $17 - mask of stuff to store
513 * Scratch registers available: $7, $2, $4, $1
514 * we know that we'll be taking a minimum of one trip through
515 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
516 * Assumes the wh64 needs to be for 2 trips through the loop in the future
517 * The wh64 is issued on for the starting destination address for trip +2
518 * through the loop, and if there are less than two trips left, the target
519 * address will be for the current trip.
523 wh64 ($4) # L1 : memory subsystem write hint
524 subq $3, 24, $2 # E : For determining future wh64 addresses
528 addq $5, 128, $4 # E : speculative target of next wh64
530 stq $17, 16($5) # L :
531 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
533 stq $17, 24($5) # L :
534 stq $17, 32($5) # L :
535 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
538 stq $17, 40($5) # L :
539 stq $17, 48($5) # L :
540 subq $3, 16, $2 # E : Repeat the loop at least once more?
543 stq $17, 56($5) # L :
544 addq $5, 64, $5 # E :
546 bge $2, $do_wh64_w # U :
551 beq $3, no_quad_w # U : Might have finished already
555 * Simple loop for trailing quadwords, or for small amounts
556 * of data (where we can't use an unrolled loop and wh64)
560 subq $3,1,$3 # E : Decrement number quads left
561 addq $5,8,$5 # E : Inc address
562 bne $3,loop_w # U : more?
566 * Write 0..7 trailing bytes.
569 beq $18,end_w # U : All done?
571 mskqh $7,$6,$2 # U : Mask final quad
573 insqh $17,$6,$4 # U : New bits
574 bis $2,$4,$1 # E : Put it all together
575 stq $1,0($5) # L : And back to memory
576 ret $31,($26),1 # L0 :
579 ldq_u $1,0($16) # L :
580 insql $17,$16,$2 # U : New bits
581 mskql $1,$16,$4 # U : Clear old
582 bis $2,$4,$2 # E : New result
587 stq_u $1,0($16) # L :
593 ret $31,($26),1 # L0 :