2 * Copyright © 2012 Raspberry Pi Foundation
3 * Copyright © 2012 RISC OS Open Ltd
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of the copyright holders not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission. The copyright holders make no
12 * representations about the suitability of this software for any purpose. It
13 * is provided "as is" without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Ben Avison (bavison@riscosopen.org)
28 /* Prevent the stack from becoming executable */
29 #if defined(__linux__) && defined(__ELF__)
30 .section .note.GNU-stack,"",%progbits
40 #include "pixman-arm-asm.h"
41 #include "pixman-arm-simd-asm.h"
43 /* A head macro should do all processing which results in an output of up to
44 * 16 bytes, as far as the final load instruction. The corresponding tail macro
45 * should complete the processing of the up-to-16 bytes. The calling macro will
46 * sometimes choose to insert a preload or a decrement of X between them.
47 * cond ARM condition code for code block
48 * numbytes Number of output bytes that should be generated this time
49 * firstreg First WK register in which to place output
50 * unaligned_src Whether to use non-wordaligned loads of source image
51 * unaligned_mask Whether to use non-wordaligned loads of mask image
52 * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
@ Plain blit (copy): no per-pixel arithmetic, so the head is just a load of
@ numbytes of source into the WK registers starting at firstreg; there is no
@ tail-side processing.
56 line_saved_regs STRIDE_D, STRIDE_S
59 .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
60 pixld cond, numbytes, firstreg, SRC, unaligned_src
@ Specialised inner loop for long blit runs: streams 32 bytes per iteration
@ (two 16-byte loads into WK0-3 and WK4-7), then decrements the remaining
@ pixel count X by the number of pixels held in 32 bytes (32*8/src_bpp).
63 .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
68 110: pixld , 16, 0, SRC, unaligned_src
69 pixld , 16, 4, SRC, unaligned_src
73 subs X, X, #32*8/src_bpp
@ Blit entry points. generate_composite_function arguments (per the inline
@ annotations): name, src_bpp, mask_bpp, dst_bpp, flags, prefetch distance,
@ then the init/newline/cleanup/process-head/process-tail macros.
@ 32bpp -> 32bpp copy.
81 generate_composite_function \
82 pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
83 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
84 4, /* prefetch distance */ \
86 nop_macro, /* newline */ \
87 nop_macro, /* cleanup */ \
89 nop_macro, /* process tail */ \
@ 16bpp -> 16bpp copy.
92 generate_composite_function \
93 pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
94 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
95 4, /* prefetch distance */ \
97 nop_macro, /* newline */ \
98 nop_macro, /* cleanup */ \
100 nop_macro, /* process tail */ \
@ 8bpp -> 8bpp copy.
103 generate_composite_function \
104 pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
105 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
106 3, /* prefetch distance */ \
108 nop_macro, /* newline */ \
109 nop_macro, /* cleanup */ \
111 nop_macro, /* process tail */ \
114 /******************************************************************************/
@ Solid-fill init macros: fetch the constant source colour from the stacked
@ arguments and replicate it to fill a 32-bit register.
116 .macro src_n_8888_init
117 ldr SRC, [sp, #ARGS_STACK_OFFSET]
@ 16bpp fill: load the 0565 halfword and duplicate it into both halves of SRC.
123 .macro src_n_0565_init
124 ldrh SRC, [sp, #ARGS_STACK_OFFSET]
125 orr SRC, SRC, lsl #16
@ 8bpp fill: load the byte, then widen; the lsl #16 step duplicates halfwords
@ (presumably preceded by a byte-duplication step not shown here — TODO confirm).
132 ldrb SRC, [sp, #ARGS_STACK_OFFSET]
134 orr SRC, SRC, lsl #16
@ Fill tail: the replicated colour is already in registers from 4 upward, so
@ just store numbytes of it to the destination.
140 .macro fill_process_tail cond, numbytes, firstreg
145 pixst cond, numbytes, 4, DST
@ Solid-fill entry points (source is a constant colour, no mask).
@ NOTE(review): restored the ',' argument separators before the line
@ continuations; sibling invocations (e.g. src_8888_8888 above) all have them.
152 generate_composite_function \
153 pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
154 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
155 0, /* prefetch distance doesn't apply */ \
157 nop_macro, /* newline */ \
158 nop_macro, /* cleanup */ \
159 nop_macro, /* process head */ \
162 generate_composite_function \
163 pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
164 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
165 0, /* prefetch distance doesn't apply */ \
167 nop_macro, /* newline */ \
168 nop_macro, /* cleanup */ \
169 nop_macro, /* process head */ \
172 generate_composite_function \
173 pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
174 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
175 0, /* prefetch distance doesn't apply */ \
177 nop_macro, /* newline */ \
178 nop_macro, /* cleanup */ \
179 nop_macro, /* process head */ \
182 /******************************************************************************/
@ x888 -> 8888 conversion: force the (unused) alpha byte of each pixel to 0xFF.
@ NOTE(review): restored "WK&reg" — the '&' concatenation had been mangled to
@ the '®' character by lenient HTML-entity decoding of "&reg" (compare the
@ intact "orr&cond" on the same line and "WK&dst"/"WK&src" elsewhere).
184 .macro src_x888_8888_pixel, cond, reg
185 orr&cond WK&reg, WK&reg, #0xFF000000
@ Head: plain load of numbytes of source pixels into WK registers.
188 .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
189 pixld cond, numbytes, firstreg, SRC, unaligned_src
@ Tail: set the alpha byte on each loaded pixel in turn.
192 .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
193 src_x888_8888_pixel cond, %(firstreg+0)
195 src_x888_8888_pixel cond, %(firstreg+1)
197 src_x888_8888_pixel cond, %(firstreg+2)
198 src_x888_8888_pixel cond, %(firstreg+3)
@ x888 -> 8888 entry point.
203 generate_composite_function \
204 pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
205 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
206 3, /* prefetch distance */ \
207 nop_macro, /* init */ \
208 nop_macro, /* newline */ \
209 nop_macro, /* cleanup */ \
210 pixman_composite_src_x888_8888_process_head, \
211 pixman_composite_src_x888_8888_process_tail
213 /******************************************************************************/
@ 0565 -> 8888 conversion setup. MASK holds the green-field extraction mask for
@ two packed 0565 pixels; STRIDE_M holds the opaque-alpha constant.
215 .macro src_0565_8888_init
216 /* Hold loop invariants in MASK and STRIDE_M */
217 ldr MASK, =0x07E007E0
218 mov STRIDE_M, #0xFF000000
219 /* Set GE[3:0] to 1010 so SEL instructions do what we want */
220 ldr SCRATCH, =0x80008000
221 uadd8 SCRATCH, SCRATCH, SCRATCH
@ Convert two packed 0565 pixels (in WK&reg1) to two 8888 pixels in
@ WK&reg1/WK&reg2, widening each 5/6-bit field to 8 bits by bit replication
@ and OR-ing in opaque alpha from STRIDE_M. Bit diagrams on each line show the
@ register contents after that instruction.
@ NOTE(review): restored "WK&reg1"/"WK&reg2" — '&reg' had been mangled to '®'
@ by HTML-entity decoding (the macro declares parameters reg1, reg2).
224 .macro src_0565_8888_2pixels, reg1, reg2
225 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
226 bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
227 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
228 mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
229 mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
230 bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
231 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
232 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
233 pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
234 sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
235 mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
236 pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
237 sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
238 orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
239 orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
242 /* This version doesn't need STRIDE_M, but is one instruction longer.
243 It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
244 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
245 bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
246 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
247 mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
248 mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
249 bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
250 mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
251 mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
252 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
253 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
254 pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
255 pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
256 sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
257 sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
258 orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
259 orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
@ Convert a single 0565 pixel (low halfword of WK&reg) to one 8888 pixel.
262 .macro src_0565_8888_1pixel, reg
263 bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
264 and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
265 mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
266 mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
267 orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
268 orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
269 pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
270 sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
271 orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
@ Head: source is 16bpp while output is 32bpp, so load numbytes/2 bytes of
@ source (8/4/2 bytes for 16/8/4 output bytes).
274 .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
276 pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
277 .elseif numbytes == 8
278 pixld , 4, firstreg, SRC, unaligned_src
279 .elseif numbytes == 4
280 pixld , 2, firstreg, SRC, unaligned_src
@ Tail: widen the loaded 0565 pixels in place, two at a time where possible.
284 .macro src_0565_8888_process_tail cond, numbytes, firstreg
286 src_0565_8888_2pixels firstreg, %(firstreg+1)
287 src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
288 .elseif numbytes == 8
289 src_0565_8888_2pixels firstreg, %(firstreg+1)
291 src_0565_8888_1pixel firstreg
@ 0565 -> 8888 entry point.
295 generate_composite_function \
296 pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
297 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
298 3, /* prefetch distance */ \
299 src_0565_8888_init, \
300 nop_macro, /* newline */ \
301 nop_macro, /* cleanup */ \
302 src_0565_8888_process_head, \
303 src_0565_8888_process_tail
305 /******************************************************************************/
@ 8888/x888 -> 0565 conversion. MASK holds a pair of 5-bit field masks used to
@ extract red and blue from two pixels at once.
307 .macro src_x888_0565_init
308 /* Hold loop invariant in MASK */
309 ldr MASK, =0x001F001F
310 line_saved_regs STRIDE_S, ORIG_W
@ Pack one 8888 pixel (WK&s) down to 0565 in the low half of WK&d.
313 .macro src_x888_0565_1pixel s, d
314 and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
315 and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
316 orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
317 orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
318 /* Top 16 bits are discarded during the following STRH */
@ Pack two 8888 pixels (WK&slo low pixel, WK&shi high pixel) into one word of
@ two 0565 pixels in WK&d; WK&tmp is a scratch register (clobbers WK&shi).
321 .macro src_x888_0565_2pixels slo, shi, d, tmp
322 and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
323 and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
324 and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
325 orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
326 orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
327 and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
328 orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
329 orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
330 pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
@ Head: source is 32bpp while output is 16bpp, so load numbytes*2 bytes of
@ source into the upper WK registers (from 4) and start packing.
333 .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
339 pixld , 16, 4, SRC, 0
340 src_x888_0565_2pixels 4, 5, 0, 0
342 src_x888_0565_2pixels 6, 7, 1, 1
345 pixld , numbytes*2, 4, SRC, 0
@ Tail: finish packing the remaining pixels and store the results
@ (this process macro does its own stores — FLAG_PROCESS_DOES_STORE below).
349 .macro src_x888_0565_process_tail cond, numbytes, firstreg
351 src_x888_0565_2pixels 4, 5, 2, 2
352 src_x888_0565_2pixels 6, 7, 3, 4
353 .elseif numbytes == 8
354 src_x888_0565_2pixels 4, 5, 1, 1
355 src_x888_0565_2pixels 6, 7, 2, 2
356 .elseif numbytes == 4
357 src_x888_0565_2pixels 4, 5, 1, 1
359 src_x888_0565_1pixel 4, 1
362 pixst , numbytes, 0, DST
364 pixst , numbytes, 1, DST
@ x888 -> 0565 entry point.
372 generate_composite_function \
373 pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
374 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
375 3, /* prefetch distance */ \
376 src_x888_0565_init, \
377 nop_macro, /* newline */ \
378 nop_macro, /* cleanup */ \
379 src_x888_0565_process_head, \
380 src_x888_0565_process_tail
382 /******************************************************************************/
@ ADD operator on 8bpp surfaces: saturating per-byte add of source to
@ destination. Here MASK/STRIDE_M serve as extra working registers holding
@ the loaded source bytes (presumably aliased to WK4/WK5 — TODO confirm
@ against the register definitions in pixman-arm-simd-asm.h).
384 .macro add_8_8_8pixels cond, dst1, dst2
385 uqadd8&cond WK&dst1, WK&dst1, MASK
386 uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
389 .macro add_8_8_4pixels cond, dst
390 uqadd8&cond WK&dst, WK&dst, MASK
@ Head: interleave source and destination loads; for 16-byte runs the first
@ 8 source bytes are combined early so the register pair can be reloaded.
393 .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
397 pixld cond, 8, 4, SRC, unaligned_src
398 pixld cond, 16, firstreg, DST, 0
399 add_8_8_8pixels cond, firstreg, %(firstreg+1)
400 pixld cond, 8, 4, SRC, unaligned_src
402 pixld cond, numbytes, 4, SRC, unaligned_src
403 pixld cond, numbytes, firstreg, DST, 0
@ Tail: combine whatever the head left outstanding.
409 .macro add_8_8_process_tail cond, numbytes, firstreg
411 add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
412 .elseif numbytes == 8
413 add_8_8_8pixels cond, firstreg, %(firstreg+1)
415 add_8_8_4pixels cond, firstreg
@ add_8_8 entry point.
419 generate_composite_function \
420 pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
421 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
422 2, /* prefetch distance */ \
423 nop_macro, /* init */ \
424 nop_macro, /* newline */ \
425 nop_macro, /* cleanup */ \
426 add_8_8_process_head, \
429 /******************************************************************************/
@ OVER operator, 8888 on 8888. MASK holds 0x00800080 (the +0.5 rounding
@ constant for the byte multiplies); GE flags are set so SEL merges
@ even/odd byte lanes.
431 .macro over_8888_8888_init
432 /* Hold loop invariant in MASK */
433 ldr MASK, =0x00800080
434 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
435 uadd8 SCRATCH, MASK, MASK
436 line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
@ Head: source pixels go to WK4+, destination pixels to WK0+.
439 .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
444 pixld , numbytes, %(4+firstreg), SRC, unaligned_src
445 pixld , numbytes, firstreg, DST, 0
452 .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
453 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
@ Extract the alpha byte of the next source pixel ahead of time.
464 .macro over_8888_8888_prepare next
465 mov WK&next, WK&next, lsr #24
@ Composite one pixel: dst = dst*(255-src_alpha)/255 + src, with the next
@ pixel's alpha extraction interleaved into the multiply stalls.
468 .macro over_8888_8888_1pixel src, dst, offset, next
469 /* src = destination component multiplier */
470 rsb WK&src, WK&src, #255
471 /* Split even/odd bytes of dst into SCRATCH/dst */
472 uxtb16 SCRATCH, WK&dst
473 uxtb16 WK&dst, WK&dst, ror #8
474 /* Multiply through, adding 0.5 to the upper byte of result for rounding */
475 mla SCRATCH, SCRATCH, WK&src, MASK
476 mla WK&dst, WK&dst, WK&src, MASK
477 /* Where we would have had a stall between the result of the first MLA and the shifter input,
478 * reload the complete source pixel */
479 ldr WK&src, [SRC, #offset]
480 /* Multiply by 257/256 to approximate 256/255 */
481 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
482 /* In this stall, start processing the next pixel */
484 mov WK&next, WK&next, lsr #24
486 uxtab16 WK&dst, WK&dst, WK&dst, ror #8
487 /* Recombine even/odd bytes of multiplied destination */
488 mov SCRATCH, SCRATCH, ror #8
489 sel WK&dst, SCRATCH, WK&dst
490 /* Saturated add of source to multiplied destination */
491 uqadd8 WK&dst, WK&dst, WK&src
@ Tail: skip fully-transparent runs, then composite pixel by pixel using
@ assembler symbols PROCESS_REG/PROCESS_OFF as the loop counters.
494 .macro over_8888_8888_process_tail cond, numbytes, firstreg
499 over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
501 over_8888_8888_prepare %(4+firstreg)
502 .set PROCESS_REG, firstreg
503 .set PROCESS_OFF, -numbytes
505 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
506 .set PROCESS_REG, PROCESS_REG+1
507 .set PROCESS_OFF, PROCESS_OFF+4
509 pixst , numbytes, firstreg, DST
@ over_8888_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags —
@ sibling invocations (e.g. src_x888_0565 above) all have them.
517 generate_composite_function \
518 pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
519 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
520 2, /* prefetch distance */ \
521 over_8888_8888_init, \
522 nop_macro, /* newline */ \
523 nop_macro, /* cleanup */ \
524 over_8888_8888_process_head, \
525 over_8888_8888_process_tail
527 /******************************************************************************/
529 /* Multiply each byte of a word by a byte.
530 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
531 * word Register containing 4 bytes
532 * byte Register containing byte multiplier (bits 8-31 must be 0)
533 * tmp Scratch register
534 * half Register containing the constant 0x00800080
535 * GE[3:0] bits must contain 0101
537 .macro mul_8888_8 word, byte, tmp, half
538 /* Split even/odd bytes of word apart */
540 uxtb16 word, word, ror #8
541 /* Multiply bytes together with rounding, then by 257/256 */
542 mla tmp, tmp, byte, half
543 mla word, word, byte, half /* 1 stall follows */
544 uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
545 uxtab16 word, word, word, ror #8
546 /* Recombine bytes */
551 /******************************************************************************/
@ OVER with a constant (solid) mask: only the mask's alpha byte is used,
@ held in MASK; STRIDE_M holds the 0x00800080 rounding constant.
553 .macro over_8888_n_8888_init
554 /* Mask is constant */
555 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
556 /* Hold loop invariant in STRIDE_M */
557 ldr STRIDE_M, =0x00800080
558 /* We only want the alpha bits of the constant mask */
559 mov MASK, MASK, lsr #24
560 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
561 uadd8 SCRATCH, STRIDE_M, STRIDE_M
562 line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
@ Head: only WK4/WK5 are free for source pixels (WK6/WK7 are temporaries),
@ hence the firstreg%2 addressing.
565 .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
570 pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
571 pixld , numbytes, firstreg, DST, 0
@ One pixel: src *= mask_alpha; dst = dst*(255-src_alpha) + src (saturating).
578 .macro over_8888_n_8888_1pixel src, dst
579 mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
580 sub WK7, WK6, WK&src, lsr #24
581 mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
582 uqadd8 WK&dst, WK&dst, WK&src
585 .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
590 over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
593 .set PROCESS_REG, firstreg
595 .if numbytes == 16 && PROCESS_REG == 2
596 /* We're using WK6 and WK7 as temporaries, so half way through
597 * 4 pixels, reload the second two source pixels but this time
598 * into WK4 and WK5 */
599 ldmdb SRC, {WK4, WK5}
601 over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
602 .set PROCESS_REG, PROCESS_REG+1
604 pixst , numbytes, firstreg, DST
@ over_8888_n_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
612 generate_composite_function \
613 pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
614 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
615 2, /* prefetch distance */ \
616 over_8888_n_8888_init, \
617 nop_macro, /* newline */ \
618 nop_macro, /* cleanup */ \
619 over_8888_n_8888_process_head, \
620 over_8888_n_8888_process_tail
622 /******************************************************************************/
@ OVER with constant source and 8-bit mask. SRC holds the pre-split odd bytes
@ of the source colour; the 0x00800080 rounding constant is reloaded into
@ STRIDE_D at the start of each line (register pressure prevents keeping it).
624 .macro over_n_8_8888_init
625 /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
626 ldr SRC, [sp, #ARGS_STACK_OFFSET]
627 /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
628 ldr SCRATCH, =0x00800080
630 uxtb16 SRC, SRC, ror #8
631 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
632 uadd8 SCRATCH, SCRATCH, SCRATCH
633 line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
636 .macro over_n_8_8888_newline
637 ldr STRIDE_D, =0x00800080
@ Head: mask is 8bpp while destination is 32bpp, so load numbytes/4 mask
@ bytes (into WK4) and numbytes of destination.
643 .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
645 pixld , numbytes/4, 4, MASK, unaligned_mask
646 pixld , numbytes, firstreg, DST, 0
@ One pixel: extract mask byte 'src' from WK4, multiply the constant source
@ by it, then blend over the destination pixel in WK&dst.
650 .macro over_n_8_8888_1pixel src, dst
651 uxtb Y, WK4, ror #src*8
652 /* Trailing part of multiplication of source */
653 mla SCRATCH, STRIDE_S, Y, STRIDE_D
654 mla Y, SRC, Y, STRIDE_D
656 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
657 uxtab16 Y, Y, Y, ror #8
658 mov SCRATCH, SCRATCH, ror #8
659 sub ORIG_W, ORIG_W, Y, lsr #24
661 /* Then multiply the destination */
662 mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
663 uqadd8 WK&dst, WK&dst, Y
666 .macro over_n_8_8888_process_tail cond, numbytes, firstreg
670 .set PROCESS_REG, firstreg
672 over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
673 .set PROCESS_REG, PROCESS_REG+1
675 pixst , numbytes, firstreg, DST
@ over_n_8_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
680 generate_composite_function \
681 pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
682 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
683 2, /* prefetch distance */ \
684 over_n_8_8888_init, \
685 over_n_8_8888_newline, \
686 nop_macro, /* cleanup */ \
687 over_n_8_8888_process_head, \
688 over_n_8_8888_process_tail
690 /******************************************************************************/
@ OVER-reverse with constant source: destination stays on top, the constant
@ source only shows through where the destination is not fully opaque.
@ STRIDE_M holds the source's odd (alpha/green) bytes pre-split.
692 .macro over_reverse_n_8888_init
693 ldr SRC, [sp, #ARGS_STACK_OFFSET]
694 ldr MASK, =0x00800080
695 /* Split source pixel into RB/AG parts */
697 uxtb16 STRIDE_M, SRC, ror #8
698 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
699 uadd8 SCRATCH, MASK, MASK
700 line_saved_regs STRIDE_D, ORIG_W
703 .macro over_reverse_n_8888_newline
@ Head: only the destination pixels are needed (source is constant).
707 .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
708 pixld , numbytes, firstreg, DST, 0
@ One pixel: dst += src * (255 - dst_alpha), with shortcuts for
@ transparent destination (take source) and opaque destination (skip).
711 .macro over_reverse_n_8888_1pixel d, is_only
713 beq 8f /* replace with source */
714 bics ORIG_W, STRIDE_D, WK&d, lsr #24
716 beq 49f /* skip store */
718 beq 9f /* write same value back */
720 mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
721 mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
722 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
723 uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
724 mov SCRATCH, SCRATCH, ror #8
725 sel ORIG_W, SCRATCH, ORIG_W
726 uqadd8 WK&d, WK&d, ORIG_W
@ Tail: AND the alpha bytes of all destination pixels together; if all are
@ 0xFF the whole group is opaque and the store is skipped, otherwise each
@ pixel is composited individually.
@ NOTE(review): restored "WK&reg1".."WK&reg4" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg4).
732 .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
734 over_reverse_n_8888_1pixel reg1, 1
736 and SCRATCH, WK&reg1, WK&reg2
738 and SCRATCH, SCRATCH, WK&reg3
739 and SCRATCH, SCRATCH, WK&reg4
741 mvns SCRATCH, SCRATCH, asr #24
742 beq 49f /* skip store if all opaque */
743 over_reverse_n_8888_1pixel reg1, 0
744 over_reverse_n_8888_1pixel reg2, 0
746 over_reverse_n_8888_1pixel reg3, 0
747 over_reverse_n_8888_1pixel reg4, 0
750 pixst , numbytes, reg1, DST
@ Thin wrapper mapping the generic process-tail interface onto the tail above.
754 .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
755 over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
@ over_reverse_n_8888 entry point.
@ NOTE(review): restored the ',' separator after "32", matching sibling
@ invocations.
758 generate_composite_function \
759 pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
760 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
761 3, /* prefetch distance */ \
762 over_reverse_n_8888_init, \
763 over_reverse_n_8888_newline, \
764 nop_macro, /* cleanup */ \
765 over_reverse_n_8888_process_head, \
766 over_reverse_n_8888_process_tail
768 /******************************************************************************/
@ OVER with white source and component-alpha 8888 mask: each mask channel
@ independently blends white over the corresponding destination channel.
770 .macro over_white_8888_8888_ca_init
777 line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
778 ldr SCRATCH, =0x800080
780 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
781 uadd8 SCRATCH, SCRATCH, SCRATCH
782 .set DST_PRELOAD_BIAS, 8
785 .macro over_white_8888_8888_ca_cleanup
786 .set DST_PRELOAD_BIAS, 0
@ Combine one destination pixel d with mask pixel m:
@ d = d*(255-m) per channel (via TMP0 = ~m, split into even/odd byte lanes,
@ 16x16 multiplies per channel) then saturating-add the mask (white*m = m).
795 .macro over_white_8888_8888_ca_combine m, d
796 uxtb16 TMP1, TMP0 /* rb_notmask */
797 uxtb16 TMP2, d /* rb_dest; 1 stall follows */
798 smlatt TMP3, TMP2, TMP1, HALF /* red */
799 smlabb TMP2, TMP2, TMP1, HALF /* blue */
800 uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
801 uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
802 smlatt d, TMP1, TMP0, HALF /* alpha */
803 smlabb TMP1, TMP1, TMP0, HALF /* green */
804 pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
805 pkhbt TMP1, TMP1, d, lsl #16 /* ag */
806 uxtab16 TMP0, TMP0, TMP0, ror #8
807 uxtab16 TMP1, TMP1, TMP1, ror #8
808 mov TMP0, TMP0, ror #8
810 uqadd8 d, d, m /* d is a late result */
@ Single-pixel head: load one mask pixel into WK1.
813 .macro over_white_8888_8888_ca_1pixel_head
814 pixld , 4, 1, MASK, 0
@ Single-pixel tail: shortcut fully-transparent (teq detects all-0) and
@ fully-opaque mask values; otherwise run the full combine.
818 .macro over_white_8888_8888_ca_1pixel_tail
820 teq WK1, WK1, asr #32
825 01: over_white_8888_8888_ca_combine WK1, WK3
826 02: pixst , 4, 3, DST
830 .macro over_white_8888_8888_ca_2pixels_head
831 pixld , 8, 1, MASK, 0
834 .macro over_white_8888_8888_ca_2pixels_tail
837 teq WK1, WK1, asr #32
844 01: over_white_8888_8888_ca_combine WK1, WK3
846 teq WK2, WK2, asr #32
850 03: over_white_8888_8888_ca_combine WK2, WK4
851 04: pixst , 8, 3, DST
@ Dispatch by numbytes: pixels are handled one or two at a time because the
@ combine needs many temporaries.
855 .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
857 over_white_8888_8888_ca_1pixel_head
860 over_white_8888_8888_ca_2pixels_head
861 over_white_8888_8888_ca_2pixels_tail
863 over_white_8888_8888_ca_2pixels_head
867 .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
869 over_white_8888_8888_ca_1pixel_tail
871 over_white_8888_8888_ca_2pixels_tail
@ over_white_8888_8888_ca entry point (also the branch target of the
@ over_n_..._ca dispatcher below when the source is solid white).
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
875 generate_composite_function \
876 pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
877 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
878 2, /* prefetch distance */ \
879 over_white_8888_8888_ca_init, \
880 nop_macro, /* newline */ \
881 over_white_8888_8888_ca_cleanup, \
882 over_white_8888_8888_ca_process_head, \
883 over_white_8888_8888_ca_process_tail
@ OVER with constant (non-white) source and component-alpha mask. Constants
@ that don't fit in registers (RB_FLDS, A_SRC, two HALF values) are spilled
@ to a locals area on the stack; ARGS_STACK_OFFSET is rebased accordingly.
886 .macro over_n_8888_8888_ca_init
887 /* Set up constants. RB_SRC and AG_SRC are in registers;
888 * RB_FLDS, A_SRC, and the two HALF values need to go on the
889 * stack (and the full SRC value is already there) */
890 ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
892 orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
893 mov WK1, #0x80 /* HALF default value */
894 mov WK2, SCRATCH, lsr #24 /* A_SRC */
895 orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
897 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
899 uxtb16 STRIDE_S, SCRATCH, ror #8
901 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
902 uadd8 SCRATCH, WK3, WK3
913 RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
921 line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
@ Cleanup: undo the stack rebasing done by init.
924 .macro over_n_8888_8888_ca_cleanup
926 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
@ Head: load one component-alpha mask pixel into WK6.
946 .macro over_n_8888_8888_ca_1pixel_head
947 pixld , 4, 6, MASK, 0
@ Tail: three-way dispatch on the mask value — transparent (skip), fully
@ opaque (reduces to over_8888_8888 or a plain copy if the source is also
@ opaque), or the general per-channel case at label 20.
951 .macro over_n_8888_8888_ca_1pixel_tail
952 ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
953 uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
954 teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
957 /* Mask is fully opaque (all channels) */
958 ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
959 eors A_SRC, A_SRC, #0xFF
961 /* Source is also opaque - same as src_8888_8888 */
964 10: /* Same as over_8888_8888 */
965 mul_8888_8 WK0, A_SRC, WK5, HALF
968 20: /* No simplifications possible - do it the hard way */
969 uxtb16 WK2, WK6, ror #8 /* ag_mask */
970 mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
971 mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
972 ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
973 uxtb16 WK5, WK0 /* rb_dest */
974 uxtab16 WK3, WK3, WK3, ror #8
975 uxtb16 WK6, WK0, ror #8 /* ag_dest */
976 uxtab16 WK4, WK4, WK4, ror #8
977 smlatt WK0, RB_SRC, WK1, HALF /* red1 */
978 smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
979 bic WK3, RB_FLDS, WK3, lsr #8
980 bic WK4, RB_FLDS, WK4, lsr #8
981 pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
982 smlatt WK0, WK5, WK3, HALF /* red2 */
983 smlabb WK3, WK5, WK3, HALF /* blue2 */
984 uxtab16 WK1, WK1, WK1, ror #8
985 smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
986 pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
987 smlabb WK0, AG_SRC, WK2, HALF /* green1 */
988 smlatt WK2, WK6, WK4, HALF /* alpha2 */
989 smlabb WK4, WK6, WK4, HALF /* green2 */
990 pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
991 uxtab16 WK3, WK3, WK3, ror #8
992 pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
993 uxtab16 WK0, WK0, WK0, ror #8
994 uxtab16 WK4, WK4, WK4, ror #8
997 sel WK2, WK1, WK0 /* recombine source*mask */
998 sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
999 uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
1000 30: /* The destination buffer is already in the L1 cache, so
1001 * there's little point in amalgamating writes */
@ Pixels are processed strictly one at a time (the tail needs most registers).
1006 .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1007 .rept (numbytes / 4) - 1
1008 over_n_8888_8888_ca_1pixel_head
1009 over_n_8888_8888_ca_1pixel_tail
1011 over_n_8888_8888_ca_1pixel_head
1014 .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
1015 over_n_8888_8888_ca_1pixel_tail
@ Public entry: branch to the specialised white-source routine when the
@ constant source is white, otherwise fall through to the helper below.
1018 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
1021 beq pixman_composite_over_white_8888_8888_ca_asm_armv6
1022 /* else drop through... */
@ General (non-white source) component-alpha helper, entered by fall-through
@ from the dispatcher above.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1024 generate_composite_function \
1025 pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
1026 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
1027 2, /* prefetch distance */ \
1028 over_n_8888_8888_ca_init, \
1029 nop_macro, /* newline */ \
1030 over_n_8888_8888_ca_cleanup, \
1031 over_n_8888_8888_ca_process_head, \
1032 over_n_8888_8888_ca_process_tail
1034 /******************************************************************************/
@ IN-reverse, 8888 on 8888: destination is multiplied by the source's alpha.
@ Only the alpha byte of each source pixel is ever read (see the byte loads
@ in the head macro), hence the source-pointer offset mentioned below.
1036 .macro in_reverse_8888_8888_init
1037 /* Hold loop invariant in MASK */
1038 ldr MASK, =0x00800080
1039 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1040 uadd8 SCRATCH, MASK, MASK
1041 /* Offset the source pointer: we only need the alpha bytes */
1043 line_saved_regs ORIG_W
@ Head: load just the alpha byte of each source pixel (pointer strides by 4),
@ first into ORIG_W and then into the WK registers; advance DST past the
@ group so the tail can address it with negative offsets.
@ NOTE(review): restored "WK&reg1".."WK&reg3" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg3).
1046 .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
1047 ldrb ORIG_W, [SRC], #4
1049 ldrb WK&reg1, [SRC], #4
1051 ldrb WK&reg2, [SRC], #4
1052 ldrb WK&reg3, [SRC], #4
1055 add DST, DST, #numbytes
1058 .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1059 in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
@ One pixel: d = d * s / 255 per byte lane (s is a source alpha byte),
@ using the usual MLA + 257/256 correction + SEL lane-merge sequence.
1062 .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
1066 ldrb ORIG_W, [SRC, #offset]
1072 uxtb16 SCRATCH, d /* rb_dest */
1073 uxtb16 d, d, ror #8 /* ag_dest */
1074 mla SCRATCH, SCRATCH, s, MASK
1076 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
1077 uxtab16 d, d, d, ror #8
1078 mov SCRATCH, SCRATCH, ror #8
1082 48: /* Last mov d,#0 of the set - used as part of shortcut for
1083 * source values all 0 */
@ Tail: test whether all the source alpha bytes of the group are 0 or 255;
@ all-255 means the destination is unchanged (skip both load and store),
@ all-0 means the destination becomes 0. Only in the general case are the
@ destination pixels loaded (ldm with negative addressing, since DST was
@ pre-advanced in the head) and multiplied pixel by pixel.
@ NOTE(review): restored "WK&reg1".."WK&reg4" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg4).
1089 .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
1091 teq ORIG_W, ORIG_W, asr #32
1092 ldrne WK&reg1, [DST, #-4]
1093 .elseif numbytes == 8
1095 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
1096 ldmnedb DST, {WK&reg1-WK&reg2}
1099 teqeq ORIG_W, WK&reg2
1100 teqeq ORIG_W, WK&reg3
1101 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
1102 ldmnedb DST, {WK&reg1-WK&reg4}
1104 cmnne DST, #0 /* clear C if NE */
1105 bcs 49f /* no writes to dest if source all -1 */
1106 beq 48f /* set dest to all 0 if source all 0 */
1108 in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
1109 str WK&reg1, [DST, #-4]
1110 .elseif numbytes == 8
1111 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
1112 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
1113 stmdb DST, {WK&reg1-WK&reg2}
1115 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
1116 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
1117 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
1118 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
1119 stmdb DST, {WK&reg1-WK&reg4}
@ Thin wrapper mapping the generic process-tail interface onto the tail above.
1124 .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
1125 in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
@ in_reverse_8888_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1128 generate_composite_function \
1129 pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
1130 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
1131 2, /* prefetch distance */ \
1132 in_reverse_8888_8888_init, \
1133 nop_macro, /* newline */ \
1134 nop_macro, /* cleanup */ \
1135 in_reverse_8888_8888_process_head, \
1136 in_reverse_8888_8888_process_tail
1138 /******************************************************************************/
@ OVER with constant source, no mask: dst = dst*(255-src_alpha)/255 + src.
@ The per-pixel destination multiplier (255 - src_alpha) is loop-invariant
@ and kept in STRIDE_M.
1140 .macro over_n_8888_init
1141 ldr SRC, [sp, #ARGS_STACK_OFFSET]
1142 /* Hold loop invariant in MASK */
1143 ldr MASK, =0x00800080
1144 /* Hold multiplier for destination in STRIDE_M */
1146 sub STRIDE_M, STRIDE_M, SRC, lsr #24
1147 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1148 uadd8 SCRATCH, MASK, MASK
@ Head: only the destination pixels need loading (source is constant).
1151 .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1152 pixld , numbytes, firstreg, DST, 0
@ One pixel: scale the destination by the invariant multiplier, then
@ saturating-add the constant source.
1155 .macro over_n_8888_1pixel dst
1156 mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
1157 uqadd8 WK&dst, WK&dst, SRC
1160 .macro over_n_8888_process_tail cond, numbytes, firstreg
1161 .set PROCESS_REG, firstreg
1163 over_n_8888_1pixel %(PROCESS_REG)
1164 .set PROCESS_REG, PROCESS_REG+1
1166 pixst , numbytes, firstreg, DST
@ over_n_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1169 generate_composite_function \
1170 pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
1171 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
1172 2, /* prefetch distance */ \
1174 nop_macro, /* newline */ \
1175 nop_macro, /* cleanup */ \
1176 over_n_8888_process_head, \
1177 over_n_8888_process_tail
1179 /******************************************************************************/