/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 */
/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
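 *
 * (A worked example, with illustrative numbers not taken from any profiled
 * configuration: suppose prefetch_distance = 3 and a row covers cachelines
 * 0-9. The leading section preloads lines 0-3; the inner loop preloads
 * line n+4 while processing line n, so lines 4-9 are covered by the time
 * lines 0-5 have been processed; the trailing section then has little or
 * nothing left to mop up, depending on how the end of the row falls
 * relative to the last whole inner loop iteration.)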
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one
 * 16-byte-long, 16-byte-aligned write to the destination (the optimum type
 * of write), then this is the "medium" case. If it is not even this long,
 * then this is the "narrow" case, and there is no attempt to align writes
 * to 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
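
/* To make the case thresholds concrete (hypothetical figures): at 32 bpp a
 * 16-byte block is 4 pixels, so the "medium" path needs at least
 * 2*16*8/32 = 8 output pixels to guarantee one aligned 16-byte write, and
 * anything shorter takes the "narrow" path; if pix_per_block = 4 and
 * prefetch_distance = 2, the "wide" path additionally requires at least
 * (2+3)*4 = 20 pixels. These figures match the two cmp instructions in
 * generate_composite_function below.
 */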
/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS
/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the generated functions' behavior.
 */
.set FLAG_DST_WRITEONLY,             0
.set FLAG_DST_READWRITE,             1
.set FLAG_COND_EXEC,                 0
.set FLAG_BRANCH_OVER,               2
.set FLAG_PROCESS_PRESERVES_PSR,     0
.set FLAG_PROCESS_CORRUPTS_PSR,      4
.set FLAG_PROCESS_DOESNT_STORE,      0
.set FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256
/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0
/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,   (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,   (9*4)
#endif
/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET, 0
/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0
.set PREFETCH_TYPE_STANDARD, 1
/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm
.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm
.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm
.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
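/* A worked example of the quantities above, using hypothetical values:
 * with src_bpp = 32 and dst_bpp = 8 (i.e. 4 src bytes per dst byte), a
 * destination 1 byte past 16-byte alignment gives
 * leading_bytes = 15*4 = 60. If src & 31 = 28, then
 * inner_loop_offset = (28+60) & 31 = 24 and extra_needed = 60-24 = 36,
 * which is > 32, so two extra cachelines must be preloaded beyond those
 * that the inner loop's own prefetches will cover. */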
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
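/* (INDEX & ~(INDEX+1) isolates the trailing one-bits of INDEX, so for a
 * power-of-two SIZE >= 2 the expression is true exactly when INDEX is the
 * last index of an aligned group of SIZE; for SIZE < 2 it is always true.) */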
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the
         * sum of the shifted quantities was greater than, and not equal
         * to, 32) */
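        /* Concretely, with illustrative values: if base & 31 = 24 and the
         * remaining data spans another 16 bytes, the adds gives 40 mod 32,
         * i.e. 8 with C set and Z clear, so neither branch below is taken
         * and two preloads are issued; 24+8 gives 0 with C and Z set, which
         * adceqs turns into Z clear, C clear - one preload; and 0+0 leaves
         * Z set and C clear - no extra preload at all. */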
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm
.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm
.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm
.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm
.macro test_bits_1_0_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-1   /* C,N = bits 1,0 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
 .endif
.endm
.macro test_bits_3_2_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-3   /* C,N = bits 3,2 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-3 /* C,N = bits 3,2 of DST */
 .endif
.endm
.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 .set DECREMENT_X, 1
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set DECREMENT_X, 0
        sub     X, X, WK0, lsr #dst_bpp_shift
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
        mov     X, WK0
 .endif
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 .endif
.endm
.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm
.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm
.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm
.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
 .endif
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm
.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm
.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm
.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

pixman_asm_function fname

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_
/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif
 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif
 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif
 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif
/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (lines)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
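/* Viewed from C, a generated function therefore behaves roughly like the
 * sketch below. This is only an illustration: the real prototypes are the
 * ones bound via pixman-arm-common.h, and the argument names here are
 * invented.
 *
 *     void generated_composite(int32_t width, int32_t height,
 *                              uint32_t *dst, int32_t dst_stride,
 *                              uint32_t src, int32_t src_stride,
 *                              uint32_t mask, int32_t mask_stride);
 */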
/*
 * Assign symbolic names to registers
 */
X           .req    r0  /* pixels to go on this line */
Y           .req    r1  /* lines to go */
DST         .req    r2  /* destination pixel pointer */
STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
SRC         .req    r4  /* source pixel pointer */
STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
MASK        .req    r6  /* mask pixel pointer (if applicable) */
STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
WK0         .req    r8  /* pixel data registers */
WK1         .req    r9
WK2         .req    r10
WK3         .req    r11
SCRATCH     .req    r12
ORIG_W      .req    r14 /* width (pixels) */
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

        init
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Reserve a word in which to store X during leading pixels */
        sub     sp, sp, #4
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 .endif
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif
        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step1  dst_r_bpp, WK3, DST
 .endif

        ands    WK0, DST, #15
        beq     154f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
 .endif

        leading_15bytes  process_head, process_tail
154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif
 .endif
160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 .endif

        sub     X, X, #128/dst_w_bpp /* simplifies inner loop termination */
        ands    WK0, DST, #15
        beq     164f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 .endif
 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
 .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
 .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
        add     sp, sp, #4
 .endif

        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
.endm
.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
.macro nop_macro x:vararg
.endm