/* source/libs/pixman/pixman-src/pixman/pixman-arm-simd-asm.h */
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. The copyright holders make no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
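
/*
 * Added worked example (not from the original source; the numbers follow the
 * comparisons made in generate_composite_function below): for a 32 bpp
 * source-to-destination copy with prefetch_distance = 2, pix_per_block works
 * out to 8, so a row of fewer than 7 pixels takes the "narrow" path, a row of
 * 7 to 38 pixels takes the "medium" path, and a row of 39 or more pixels
 * takes the "wide" path.
 */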
/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,             0
.set FLAG_DST_READWRITE,             1
.set FLAG_COND_EXEC,                 0
.set FLAG_BRANCH_OVER,               2
.set FLAG_PROCESS_PRESERVES_PSR,     0
.set FLAG_PROCESS_CORRUPTS_PSR,      4
.set FLAG_PROCESS_DOESNT_STORE,      0
.set FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256
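
/*
 * Hypothetical illustration (not a definition from this file): callers of
 * generate_composite_function OR these flags together, picking one flag from
 * each complementary pair, e.g.
 *
 *     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR
 *
 * evaluates to 1 + 2 + 4 = 7, and the generated code then tests individual
 * bits with expressions such as (flags) & FLAG_BRANCH_OVER.
 */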
/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif

/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET, 0

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
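
/*
 * Added illustration (not part of the original header): with the WK0-WK3
 * aliases (r8-r11) that generate_composite_function sets up below,
 * "pixld , 16, 0, SRC, 0" turns into a single load-multiple of
 * {r8,r9,r10,r11} from SRC with writeback, whereas the unaligned variant
 * "pixld , 16, 0, SRC, 1" issues four word loads of the form
 * "ldr WKn, [SRC], #4" instead.
 */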
.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
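
/*
 * Illustrative note (added, not from the original source): PF simply emits
 * its arguments as an instruction when prefetching is enabled, so
 *
 *     PF  pld,    [SRC, #96]
 *
 * assembles to "pld [SRC, #96]" under PREFETCH_TYPE_STANDARD and to nothing
 * at all under PREFETCH_TYPE_NONE (i.e. when prefetch_distance == 0).
 */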
.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
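/*
 * Added worked example (not from the original source): with a 32 bpp source
 * and an 8 bpp destination (4 source bytes per destination byte), suppose
 * the destination needs 12 leading bytes to reach 16-byte alignment, so
 * leading_bytes = 12*32/8 = 48 source bytes, and (src+48)&31 = 8; then
 * extra_needed = 48 - 8 = 40 > 32, and the code below plants two extra
 * preloads beyond those already issued by preload_leading_step1.
 */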
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
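/*
 * Added illustration (not from the original source): (INDEX & ~(INDEX+1))
 * isolates the trailing 1 bits of INDEX, so for SIZE == 4 the test fires only
 * when INDEX has at least two trailing 1 bits, i.e. for INDEX = 3, 7, 11, ...
 * - once at the end of every group of 4 SUBBLOCKs. For SIZE < 2 it is
 * unconditionally true.
 */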
.macro preload_middle bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

.macro preload_trailing bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was greater than, and not merely equal
         * to, 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so
 * (assuming the source pixels are no wider than the destination
 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 * meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail cond1, numbytes1, firstreg1
        process_tail cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm
.macro test_bits_1_0_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-1   /* C,N = bits 1,0 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
 .endif
.endm

.macro test_bits_3_2_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-3   /* C,N = bits 3,2 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-3 /* C,N = bits 3,2 of DST */
 .endif
.endm

.macro leading_15bytes process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 .set DECREMENT_X, 1
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set DECREMENT_X, 0
        sub     X, X, WK0, lsr #dst_bpp_shift
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
        mov     X, WK0
 .endif
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 .endif
        conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 .endif
.endm

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm
.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle mask_bpp, MASK, 1
  .else
        preload_middle src_bpp, SRC, 0
        preload_middle mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
        b       112f
111:
 .endif
        process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing src_bpp, src_bpp_shift, SRC
        preload_trailing mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_trailing dst_r_bpp, dst_bpp_shift, DST
 .endif
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
.endm
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
 .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
 .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
 .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
 .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

pixman_asm_function fname

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif
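
/*
 * Added note (not from the original source): pix_per_block ends up being the
 * number of pixels covered by the widest of the channels' per-block fetches -
 * 16 bytes of destination output or 32 bytes of any channel being read,
 * whichever spans more pixels. For example, with a 32 bpp source and a
 * write-only 16 bpp destination it is max(16*8/16, 32*8/32) = 8 pixels per
 * block.
 */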
/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
/*
 * Assign symbolic names to registers
 */
X           .req    r0  /* pixels to go on this line */
Y           .req    r1  /* lines to go */
DST         .req    r2  /* destination pixel pointer */
STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
SRC         .req    r4  /* source pixel pointer */
STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
MASK        .req    r6  /* mask pixel pointer (if applicable) */
STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
WK0         .req    r8  /* pixel data registers */
WK1         .req    r9
WK2         .req    r10
WK3         .req    r11
SCRATCH     .req    r12
ORIG_W      .req    r14 /* width (pixels) */

        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Reserve a word in which to store X during leading pixels */
        sub     sp, sp, #4
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 .endif

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
151:    /* New line */
        newline
        preload_leading_step1 src_bpp, WK1, SRC
        preload_leading_step1 mask_bpp, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step1 dst_r_bpp, WK3, DST
 .endif

        ands    WK0, DST, #15
        beq     154f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
 .endif

        leading_15bytes process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 .endif

        sub     X, X, #128/dst_w_bpp /* simplifies inner loop termination */
        ands    WK0, DST, #15
        beq     164f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 .endif

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head , 1, 0, 1, 1, 0
        process_tail , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head , 2, 0, 1, 1, 0
        process_tail , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
        add     sp, sp, #4
 .endif

        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
 .endfunc
.endm

.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
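
/*
 * Added example (the register names are just the aliases defined above):
 *
 *     line_saved_regs STRIDE_D, ORIG_W
 *
 * sets LINE_SAVED_REGS to (1<<3)|(1<<14) - the encoding of r3 and r14 in an
 * LDM/STM register list - and LINE_SAVED_REG_COUNT to 2, which the spill and
 * reload words above turn into "stmdb sp!,{r3,lr}" / "ldmia sp,{r3,lr}".
 */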
.macro nop_macro x:vararg
.endm
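
/*
 * Added usage sketch (hypothetical, not part of this header): a .S file that
 * includes this header defines its own process_head/process_tail macros and
 * then instantiates a composite routine roughly along these lines. The
 * function name, flag choice, prefetch distance and the my_process_* macro
 * names are illustrative only, and the parenthesised labels are annotations
 * rather than arguments:
 *
 *     generate_composite_function \
 *         pixman_composite_example_8888_8888_asm_armv6, 32, 0, 32, \
 *         FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
 *         2, \                  (prefetch distance)
 *         nop_macro, \          (init)
 *         nop_macro, \          (newline)
 *         nop_macro, \          (cleanup)
 *         my_process_head, \
 *         my_process_tail
 *
 * nop_macro is what callers pass for the init/newline/cleanup hooks when a
 * particular routine has nothing to do at those points.
 */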