/*
 * Extracted from a git web viewer: luatex.git
 * source/libs/pixman/pixman-src/pixman/pixman-arm-simd-asm.S
 * blob a74a0a8f3460762f15b78f2f8bced177b9d9e395 (beta-0.89.2)
 */
/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4      @ keep object attributes compatible with armv4 tooling
        .arm                    @ ARM (not Thumb) instruction set
        .altmacro               @ needed for the %(expr) macro-argument evaluation used below
        .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */
/* Per-function init for plain blits: mark STRIDE_D/STRIDE_S as saved per
 * line so they can be reused as extra WK registers in the inner loop. */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm
/* Blit head: just load numbytes of source into WK registers starting at
 * firstreg; no per-pixel conversion is needed for a straight copy. */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
/* Wide inner loop for blits: copies 32 bytes per iteration using all eight
 * WK registers (WK4-7 temporarily aliased onto the line-saved registers),
 * with a preload of the next source cacheline. */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]                  @ preload ahead of the read pointer
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp             @ X counts pixels; 32 bytes per pass
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* 32bpp -> 32bpp straight copy */
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
/* 16bpp -> 16bpp straight copy */
generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
/* 8bpp -> 8bpp straight copy */
generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/
/* Solid 32bpp fill init: fetch the source colour from the argument stack and
 * replicate it into four registers so stores can write 16 bytes at a time. */
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
/* Solid 16bpp fill init: duplicate the halfword colour into both halves of
 * a word, then replicate across four registers. */
.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
/* Solid 8bpp fill init: duplicate the byte colour into all four bytes of a
 * word, then replicate across four registers. */
.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
/* Fill tail: the replicated colour lives in SRC/STRIDE_S/MASK/STRIDE_M,
 * aliased here as WK4-7, so the store always reads from register 4 up. */
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* Solid fill, 32bpp destination (args are space/comma separated; both are
 * valid gas macro-argument separators) */
generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail
/* Solid fill, 16bpp destination */
generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail
/* Solid fill, 8bpp destination */
generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/
/* Force the alpha byte of one x8r8g8b8 pixel to opaque (0xFF). */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm
/* x888->8888 head: plain load; alpha is patched up in the tail. */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
/* x888->8888 tail: set alpha on each of the 1, 2 or 4 loaded pixels. */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm
/* x8r8g8b8 -> a8r8g8b8 (set alpha to opaque) */
generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/
/* r5g6b5 -> a8r8g8b8 init: load constants and set the GE flags pattern
 * that the SEL instructions in the per-pixel macros rely on. */
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm
/* Convert two r5g6b5 pixels packed in WK&reg1 into two a8r8g8b8 pixels in
 * WK&reg1 (low source pixel) and WK&reg2 (high source pixel), expanding each
 * field to 8 bits by bit-replication. Bit diagrams trace each step. */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
/* Convert a single r5g6b5 pixel in WK&reg to a8r8g8b8 in place. */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
/* 0565->8888 head: source is half the size of the output, so load only
 * numbytes/2 of source (into every other register pair for the 16-byte case,
 * leaving room for the expansion in the tail). */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm
/* 0565->8888 tail: widen the loaded pixels in place, two at a time where
 * possible. */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm
/* r5g6b5 -> a8r8g8b8 conversion */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/
/* x8r8g8b8 -> r5g6b5 init: keep the 5-bit field mask in MASK and mark the
 * registers this operation borrows as needing per-line save/restore. */
.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F
        line_saved_regs  STRIDE_S, ORIG_W
.endm
/* Narrow one x8r8g8b8 pixel (WK&s) to r5g6b5 in the low half of WK&d. */
.macro src_x888_0565_1pixel  s, d
        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm
/* Narrow two x8r8g8b8 pixels (WK&slo low output halfword, WK&shi high) to a
 * packed pair of r5g6b5 pixels in WK&d, using WK&tmp as a temporary. */
.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
/* x888->0565 head: source is twice the output size, so for the 16-byte case
 * the loads and the first conversions are interleaved to fit in registers. */
.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        WK4     .req    STRIDE_S
        WK5     .req    STRIDE_M
        WK6     .req    WK3
        WK7     .req    ORIG_W
 .if numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
 .else
        pixld   , numbytes*2, 4, SRC, 0
 .endif
.endm
/* x888->0565 tail: finish converting the remaining loaded pixels and store.
 * Note the 16-byte case stores from register 0, the others from register 1. */
.macro src_x888_0565_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
 .elseif numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
 .elseif numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
 .else
        src_x888_0565_1pixel  4, 1
 .endif
 .if numbytes == 16
        pixst   , numbytes, 0, DST
 .else
        pixst   , numbytes, 1, DST
 .endif
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        .unreq  WK7
.endm
/* x8r8g8b8 -> r5g6b5 conversion */
generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail

/******************************************************************************/
/* Saturating add of 8 source bytes (held in MASK/STRIDE_M) to 8 dest bytes. */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond  WK&dst1, WK&dst1, MASK
        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
.endm
/* Saturating add of 4 source bytes (held in MASK) to 4 dest bytes. */
.macro add_8_8_4pixels  cond, dst
        uqadd8&cond  WK&dst, WK&dst, MASK
.endm
/* add_8_8 head: load source into WK4/WK5 (aliased onto MASK/STRIDE_M) and
 * destination into the WK output registers; for 16 bytes the first half of
 * the addition is interleaved with the second source load. */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm
/* add_8_8 tail: complete the saturating addition for this batch. */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm
/* 8bpp ADD operator */
generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/
/* OVER 8888-on-8888 init: rounding constant in MASK, GE flags set for SEL. */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm
/* OVER head: source pixels land in WK4.. (aliased onto line-saved regs),
 * destination pixels in the WK output registers. */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* Set Z if every source pixel in the batch is fully transparent (== 0),
 * allowing the whole batch to be skipped. */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm
/* Extract the alpha byte of the next source pixel ready for over_8888_8888_1pixel. */
.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm
/* One pixel of premultiplied OVER: dst = src + dst * (1 - src.alpha).
 * Instruction order is chosen to hide multiply/shift result latencies. */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm
/* OVER tail: skip the batch entirely if all source pixels are transparent,
 * otherwise composite each pixel and store. The local label 10 is the
 * skip-store target referenced by the beq below (restored; it was dropped
 * by the extraction that produced this copy). */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare  %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* Premultiplied OVER, 32bpp on 32bpp */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/
/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/
/* OVER with constant mask init: keep only the mask's alpha byte, rounding
 * constant in STRIDE_M, GE flags set for SEL. */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm
/* over_8888_n head: only WK4/WK5 are free for source pixels (WK6/WK7 are
 * needed as temporaries in the tail), hence the firstreg%2 addressing. */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* One pixel of OVER with constant mask: src *= mask.alpha, then
 * dst = src + dst * (255 - src.alpha) (WK6 holds 255, WK7 is scratch). */
.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm
/* over_8888_n tail: batch transparency skip, then per-pixel compositing.
 * Local label 10 is the skip-store target of the beq (restored; dropped by
 * the extraction that produced this copy). */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
/* Premultiplied OVER with constant mask */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/
/* OVER constant-source with 8-bit mask init: pre-split the constant source
 * into even (STRIDE_S) and odd (SRC) bytes; GE flags set for SEL. */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm
/* Per-line: reload the 0x00800080 rounding constant into STRIDE_D (it gets
 * corrupted during the line) and flush the literal pool here, branching over
 * it. Label 1 restored — it was dropped by the extraction that produced
 * this copy, leaving the b 1f dangling. */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm
/* over_n_8 head: load numbytes/4 mask bytes into WK4 (aliased onto
 * STRIDE_M) and the destination pixels into the output registers. */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm
/* One pixel of OVER(n, mask8): multiply the pre-split constant source by the
 * mask byte (extracted from WK4 by position), then blend into the dest. */
.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm
/* over_n_8 tail: skip the whole batch when all mask bytes are zero.
 * Local label 10 is the skip-store target of the beq (restored; dropped by
 * the extraction that produced this copy). */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm
/* OVER constant source through an 8-bit mask onto 32bpp */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/
/* OVER_REVERSE(n) init: pre-split the constant source into RB (STRIDE_S)
 * and AG (STRIDE_M) halves; rounding constant in MASK, GE flags for SEL. */
.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm
/* Per-line: reload the 0xFF constant (STRIDE_D is corrupted within lines). */
.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm
/* over_reverse head: only the destination needs loading (source is constant). */
.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , numbytes, firstreg, DST, 0
.endm
/* One pixel of OVER_REVERSE: dst = dst + src * (255 - dst.alpha), with fast
 * paths for transparent dest (replace with source) and opaque dest (keep).
 * Local label 9 is the keep-value fall-through target (restored; dropped by
 * the extraction that produced this copy). */
.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK&d, #0
        beq     8f       /* replace with source */
        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
 .if is_only == 1
        beq     49f      /* skip store */
 .else
        beq     9f       /* write same value back */
 .endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK&d, WK&d, ORIG_W
        b       9f
8:      mov     WK&d, SRC
9:
.endm
/* over_reverse batch tail: skip the store when every destination pixel is
 * opaque (alpha == 0xFF) since OVER_REVERSE leaves them unchanged.
 * Local label 49 is the skip-store target (restored; dropped by the
 * extraction that produced this copy). */
.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if numbytes == 4
        over_reverse_n_8888_1pixel  reg1, 1
 .else
        and     SCRATCH, WK&reg1, WK&reg2
  .if numbytes == 16
        and     SCRATCH, SCRATCH, WK&reg3
        and     SCRATCH, SCRATCH, WK&reg4
  .endif
        mvns    SCRATCH, SCRATCH, asr #24
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  reg1, 0
        over_reverse_n_8888_1pixel  reg2, 0
  .if numbytes == 16
        over_reverse_n_8888_1pixel  reg3, 0
        over_reverse_n_8888_1pixel  reg4, 0
  .endif
 .endif
        pixst   , numbytes, reg1, DST
49:
.endm
/* Adapter from the generic tail signature to over_reverse_n_8888_tail. */
.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
.endm
/* OVER_REVERSE with constant source onto 32bpp */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail

/******************************************************************************/
/* OVER white with component-alpha mask init: set up register aliases and
 * constants; DST_PRELOAD_BIAS compensates for the head/tail split. */
.macro over_white_8888_8888_ca_init
        HALF    .req    SRC
        TMP0    .req    STRIDE_D
        TMP1    .req    STRIDE_S
        TMP2    .req    STRIDE_M
        TMP3    .req    ORIG_W
        WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        .set DST_PRELOAD_BIAS, 8
.endm
/* Undo the register aliases and the DST_PRELOAD_BIAS set up by
 * over_white_8888_8888_ca_init. */
785 .macro over_white_8888_8888_ca_cleanup
786         .set DST_PRELOAD_BIAS, 0
787         .unreq  HALF
788         .unreq  TMP0
789         .unreq  TMP1
790         .unreq  TMP2
791         .unreq  TMP3
792         .unreq  WK4
793 .endm
/* Blend one dest pixel d with a white source under component mask m.
 * On entry the caller has set TMP0 = ~m (per-byte "not mask").
 * Computes, per channel with 0x80 rounding: d*(255-m)/255, then
 * saturating-adds the mask itself (white*m == m), i.e.
 * result = m + d*(1-m) per channel.  The rb/ag halves are processed as
 * two uxtb16-packed 16-bit lanes and recombined with SEL (GE=0101 was
 * set in init).  Instruction order is hand-scheduled around the noted
 * result stalls — do not reorder. */
795 .macro over_white_8888_8888_ca_combine  m, d
796         uxtb16  TMP1, TMP0                /* rb_notmask */
797         uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
798         smlatt  TMP3, TMP2, TMP1, HALF    /* red */
799         smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
800         uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
801         uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
802         smlatt  d, TMP1, TMP0, HALF       /* alpha */
803         smlabb  TMP1, TMP1, TMP0, HALF    /* green */
804         pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
805         pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
806         uxtab16 TMP0, TMP0, TMP0, ror #8
807         uxtab16 TMP1, TMP1, TMP1, ror #8
808         mov     TMP0, TMP0, ror #8
809         sel     d, TMP0, TMP1
810         uqadd8  d, d, m                   /* d is a late result */
811 .endm
/* Load one mask word into WK1 and one dest word into WK3. */
813 .macro over_white_8888_8888_ca_1pixel_head
814         pixld   , 4, 1, MASK, 0
815         pixld   , 4, 3, DST, 0
816 .endm
/* Finish one pixel: WK1 = mask, WK3 = dest (loaded by the head).
 * `teq WK1, WK1, asr #32` sets Z,C from the mask word:
 *   mask == 0          -> Zc: transparent, dest untouched (skip to 03f)
 *   mask == 0xFFFFFFFF -> ZC: opaque, dest := white (the mask word)
 *   otherwise          -> full per-channel combine
 * Fix: local label 03 (upstream blob line 827) was missing, leaving
 * `bcc 03f` dangling — restored. */
818 .macro over_white_8888_8888_ca_1pixel_tail
819         mvn     TMP0, WK1
820         teq     WK1, WK1, asr #32
821         bne     01f
822         bcc     03f
823         mov     WK3, WK1
824         b       02f
825 01:     over_white_8888_8888_ca_combine WK1, WK3
826 02:     pixst   , 4, 3, DST
827 03:
828 .endm
/* Load two mask words into WK1/WK2 (dest is loaded in the tail). */
830 .macro over_white_8888_8888_ca_2pixels_head
831         pixld   , 8, 1, MASK, 0
832 .endm
/* Finish two pixels: WK1/WK2 = mask words, dest loaded into WK3/WK4
 * here.  Per pixel: transparent mask leaves the loaded dest as-is,
 * opaque mask writes white, anything else runs the full combine.  If
 * pixel 1 is transparent AND mask 2 is zero, the store is skipped
 * entirely (05f).
 * Fix: local label 05 (upstream blob line 852) was missing, leaving
 * `beq 05f` dangling — restored. */
834 .macro over_white_8888_8888_ca_2pixels_tail
835         pixld   , 8, 3, DST
836         mvn     TMP0, WK1
837         teq     WK1, WK1, asr #32
838         bne     01f
839         movcs   WK3, WK1
840         bcs     02f
841         teq     WK2, #0
842         beq     05f
843         b       02f
844 01:     over_white_8888_8888_ca_combine WK1, WK3
845 02:     mvn     TMP0, WK2
846         teq     WK2, WK2, asr #32
847         bne     03f
848         movcs   WK4, WK2
849         b       04f
850 03:     over_white_8888_8888_ca_combine WK2, WK4
851 04:     pixst   , 8, 3, DST
852 05:
853 .endm
/* Head: 4 bytes -> single-pixel path; 8 bytes -> one 2-pixel head;
 * 16 bytes -> a complete 2-pixel head+tail followed by another head
 * (the framework's matching process_tail finishes the last pair). */
855 .macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
856  .if numbytes == 4
857         over_white_8888_8888_ca_1pixel_head
858  .else
859   .if numbytes == 16
860         over_white_8888_8888_ca_2pixels_head
861         over_white_8888_8888_ca_2pixels_tail
862   .endif
863         over_white_8888_8888_ca_2pixels_head
864  .endif
865 .endm
/* Tail: finish whichever head (1-pixel or 2-pixel) is still pending. */
867 .macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
868  .if numbytes == 4
869         over_white_8888_8888_ca_1pixel_tail
870  .else
871         over_white_8888_8888_ca_2pixels_tail
872  .endif
873 .endm
/* OVER with white source and component-alpha mask, a8r8g8b8 dest.
 * Argument separators restored after the last bpp argument and the
 * flags expression, matching the comma-terminated continuation style
 * used by the other generate_composite_function invocations here. */
875 generate_composite_function \
876     pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
877     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
878     2, /* prefetch distance */ \
879     over_white_8888_8888_ca_init, \
880     nop_macro, /* newline */ \
881     over_white_8888_8888_ca_cleanup, \
882     over_white_8888_8888_ca_process_head, \
883     over_white_8888_8888_ca_process_tail
/* Init for over_n_8888_8888_ca (solid source, component-alpha mask).
 * Splits the source into rb/ag halves kept in registers, and spills
 * RB_FLDS, A_SRC and the two HALF rounding constants to the stack so
 * the pixel tail can reload them pairwise with LDRD. */
886 .macro over_n_8888_8888_ca_init
887         /* Set up constants. RB_SRC and AG_SRC are in registers;
888          * RB_FLDS, A_SRC, and the two HALF values need to go on the
889          * stack (and the full SRC value is already there) */
890         ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
891         mov     WK0, #0x00FF0000
892         orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
893         mov     WK1, #0x80             /* HALF default value */
894         mov     WK2, SCRATCH, lsr #24  /* A_SRC */
895         orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
896         push    {WK0-WK3}
897  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
898         uxtb16  SRC, SCRATCH
899         uxtb16  STRIDE_S, SCRATCH, ror #8
901         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
902         uadd8   SCRATCH, WK3, WK3
904         .unreq  WK0
905         .unreq  WK1
906         .unreq  WK2
907         .unreq  WK3
908         WK0     .req    Y
909         WK1     .req    STRIDE_D
910         RB_SRC  .req    SRC
911         AG_SRC  .req    STRIDE_S
912         WK2     .req    STRIDE_M
913         RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
914         A_SRC   .req    r8
915         HALF    .req    r9
916         WK3     .req    r10
917         WK4     .req    r11
918         WK5     .req    SCRATCH
919         WK6     .req    ORIG_W
921         line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
922 .endm
/* Pop the four constants pushed by init and restore the default
 * WK0-WK3 register aliases. */
924 .macro over_n_8888_8888_ca_cleanup
925         add     sp, sp, #16
926  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
928         .unreq  WK0
929         .unreq  WK1
930         .unreq  RB_SRC
931         .unreq  AG_SRC
932         .unreq  WK2
933         .unreq  RB_FLDS
934         .unreq  A_SRC
935         .unreq  HALF
936         .unreq  WK3
937         .unreq  WK4
938         .unreq  WK5
939         .unreq  WK6
940         WK0     .req    r8
941         WK1     .req    r9
942         WK2     .req    r10
943         WK3     .req    r11
944 .endm
/* Load one mask word into WK6 and one dest word into WK0. */
946 .macro over_n_8888_8888_ca_1pixel_head
947         pixld   , 4, 6, MASK, 0
948         pixld   , 4, 0, DST, 0
949 .endm
/* Finish one pixel: WK6 = mask word, WK0 = dest (loaded by the head).
 * Dispatch on the mask value:
 *   all-zero mask  -> dest unchanged, no store (40f)
 *   all-0xFF mask  -> plain OVER of the solid source (or a straight
 *                     copy when the source alpha is 0xFF too)
 *   otherwise      -> full per-channel source*mask + dest*(1-alpha*mask)
 * The hard path is heavily hand-scheduled; see inline stall notes.
 * Fix: local label 40 (upstream blob line 1003) was missing, leaving
 * `bcc 40f` dangling — restored. */
951 .macro over_n_8888_8888_ca_1pixel_tail
952         ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
953         uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
954         teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
955         bne     20f
956         bcc     40f
957         /* Mask is fully opaque (all channels) */
958         ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
959         eors    A_SRC, A_SRC, #0xFF
960         bne     10f
961         /* Source is also opaque - same as src_8888_8888 */
962         mov     WK0, WK6
963         b       30f
964 10:     /* Same as over_8888_8888 */
965         mul_8888_8 WK0, A_SRC, WK5, HALF
966         uqadd8  WK0, WK0, WK6
967         b       30f
968 20:     /* No simplifications possible - do it the hard way */
969         uxtb16  WK2, WK6, ror #8         /* ag_mask */
970         mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
971         mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
972         ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
973         uxtb16  WK5, WK0                 /* rb_dest */
974         uxtab16 WK3, WK3, WK3, ror #8
975         uxtb16  WK6, WK0, ror #8         /* ag_dest */
976         uxtab16 WK4, WK4, WK4, ror #8
977         smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
978         smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
979         bic     WK3, RB_FLDS, WK3, lsr #8
980         bic     WK4, RB_FLDS, WK4, lsr #8
981         pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
982         smlatt  WK0, WK5, WK3, HALF      /* red2 */
983         smlabb  WK3, WK5, WK3, HALF      /* blue2 */
984         uxtab16 WK1, WK1, WK1, ror #8
985         smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
986         pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
987         smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
988         smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
989         smlabb  WK4, WK6, WK4, HALF      /* green2 */
990         pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
991         uxtab16 WK3, WK3, WK3, ror #8
992         pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
993         uxtab16 WK0, WK0, WK0, ror #8
994         uxtab16 WK4, WK4, WK4, ror #8
995         mov     WK1, WK1, ror #8
996         mov     WK3, WK3, ror #8
997         sel     WK2, WK1, WK0            /* recombine source*mask */
998         sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
999         uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
1000 30:     /* The destination buffer is already in the L1 cache, so
1001          * there's little point in amalgamating writes */
1002         pixst   , 4, 0, DST
1003 40:
1004 .endm
/* Process pixels one at a time: run complete head+tail pairs for all
 * but the last pixel, then leave one head pending for process_tail. */
1006 .macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1007  .rept (numbytes / 4) - 1
1008         over_n_8888_8888_ca_1pixel_head
1009         over_n_8888_8888_ca_1pixel_tail
1010  .endr
1011         over_n_8888_8888_ca_1pixel_head
1012 .endm
/* Finish the single pixel left pending by process_head. */
1014 .macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
1015         over_n_8888_8888_ca_1pixel_tail
1016 .endm
/* Entry point: peek at the solid source colour (first stack argument
 * at this point — presumably the 5th AAPCS argument; confirm against
 * the caller) and branch to the specialised white-source routine when
 * it is 0xFFFFFFFF; otherwise fall straight through into the generic
 * helper generated below. */
1018 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
1019         ldr     ip, [sp]
1020         cmp     ip, #-1
1021         beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
1022         /* else drop through... */
1023  .endfunc
/* Generic (non-white) body for over_n_8888_8888_ca; entered by
 * fall-through from the dispatcher above.  Argument separators restored
 * after the last bpp argument and the flags expression, matching the
 * comma-terminated continuation style of the other invocations. */
1024 generate_composite_function \
1025     pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
1026     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
1027     2, /* prefetch distance */ \
1028     over_n_8888_8888_ca_init, \
1029     nop_macro, /* newline */ \
1030     over_n_8888_8888_ca_cleanup, \
1031     over_n_8888_8888_ca_process_head, \
1032     over_n_8888_8888_ca_process_tail
1034 /******************************************************************************/
/* Init for in_reverse_8888_8888: dest := dest * src_alpha.  Only the
 * source alpha bytes are needed, so SRC is advanced to point at them. */
1036 .macro in_reverse_8888_8888_init
1037         /* Hold loop invariant in MASK */
1038         ldr     MASK, =0x00800080
1039         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1040         uadd8   SCRATCH, MASK, MASK
1041         /* Offset the source pointer: we only need the alpha bytes */
1042         add     SRC, SRC, #3
1043         line_saved_regs  ORIG_W
1044 .endm
/* Load just the alpha byte of each source pixel (SRC points at the
 * alpha byte, stride 4) into ORIG_W and up to three WK registers, and
 * pre-advance DST past this batch (the tail addresses it negatively). */
1046 .macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
1047         ldrb    ORIG_W, [SRC], #4
1048  .if numbytes >= 8
1049         ldrb    WK&reg1, [SRC], #4
1050   .if numbytes == 16
1051         ldrb    WK&reg2, [SRC], #4
1052         ldrb    WK&reg3, [SRC], #4
1053   .endif
1054  .endif
1055         add     DST, DST, #numbytes
1056 .endm
/* Framework glue: expand the head over consecutive WK registers. */
1058 .macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1059         in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
1060 .endm
/* Scale one dest pixel d by source alpha s, with 0x80 rounding (MASK)
 * via the uxtb16/mla/uxtab16/ror #8 sequence, recombined with SEL.
 * In the multi-pixel variants (is_only == 0) the next alpha is also
 * shuffled along, and zero / 0xFF alphas shortcut the multiply
 * (zero -> d := 0 at 01; 0xFF -> d unchanged, jump to 02).
 * Fix: local label 02 (upstream blob line 1086) was missing, leaving
 * `beq 02f` / `b 02f` dangling — restored. */
1062 .macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
1063  .if is_only != 1
1064         movs    s, ORIG_W
1065   .if offset != 0
1066         ldrb    ORIG_W, [SRC, #offset]
1067   .endif
1068         beq     01f
1069         teq     STRIDE_M, #0xFF
1070         beq     02f
1071  .endif
1072         uxtb16  SCRATCH, d                 /* rb_dest */
1073         uxtb16  d, d, ror #8               /* ag_dest */
1074         mla     SCRATCH, SCRATCH, s, MASK
1075         mla     d, d, s, MASK
1076         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
1077         uxtab16 d, d, d, ror #8
1078         mov     SCRATCH, SCRATCH, ror #8
1079         sel     d, SCRATCH, d
1080         b       02f
1081  .if offset == 0
1082 48:     /* Last mov d,#0 of the set - used as part of shortcut for
1083          * source values all 0 */
1084  .endif
1085 01:     mov     d, #0
1086 02:
1087 .endm
/* Tail: test all the source alpha bytes gathered by the head.  If every
 * alpha is 0xFF the dest is left untouched (49f); if every alpha is 0
 * the dest is simply zeroed (48f, inside the last 1pixel expansion);
 * otherwise the dest pixels are (re)loaded and scaled one by one.  DST
 * was pre-incremented by the head, hence ldmdb/stmdb and the negative
 * per-pixel offsets.
 * Fix: local label 49 (upstream blob line 1121) was missing, leaving
 * `bcs 49f` dangling — restored. */
1089 .macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
1090  .if numbytes == 4
1091         teq     ORIG_W, ORIG_W, asr #32
1092         ldrne   WK&reg1, [DST, #-4]
1093  .elseif numbytes == 8
1094         teq     ORIG_W, WK&reg1
1095         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
1096         ldmnedb DST, {WK&reg1-WK&reg2}
1097  .else
1098         teq     ORIG_W, WK&reg1
1099         teqeq   ORIG_W, WK&reg2
1100         teqeq   ORIG_W, WK&reg3
1101         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
1102         ldmnedb DST, {WK&reg1-WK&reg4}
1103  .endif
1104         cmnne   DST, #0   /* clear C if NE */
1105         bcs     49f       /* no writes to dest if source all -1 */
1106         beq     48f       /* set dest to all 0 if source all 0 */
1107  .if numbytes == 4
1108         in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
1109         str     WK&reg1, [DST, #-4]
1110  .elseif numbytes == 8
1111         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
1112         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
1113         stmdb   DST, {WK&reg1-WK&reg2}
1114  .else
1115         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
1116         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
1117         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
1118         in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
1119         stmdb   DST, {WK&reg1-WK&reg4}
1120  .endif
1121 49:
1122 .endm
/* Framework glue: expand the tail over the four consecutive WK
 * registers starting at firstreg. */
1124 .macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
1125         in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
1126 .endm
/* IN_REVERSE, a8r8g8b8 source and dest.  Argument separators restored
 * after the last bpp argument and the flags expression, matching the
 * comma-terminated continuation style of the other invocations. */
1128 generate_composite_function \
1129     pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
1130     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
1131     2, /* prefetch distance */ \
1132     in_reverse_8888_8888_init, \
1133     nop_macro, /* newline */ \
1134     nop_macro, /* cleanup */ \
1135     in_reverse_8888_8888_process_head, \
1136     in_reverse_8888_8888_process_tail
1138 /******************************************************************************/
/* Init for over_n_8888: solid source OVER dest.  Precomputes the
 * per-byte dest multiplier 255 - src_alpha in STRIDE_M, since the
 * source (and therefore its alpha) is constant for the whole image. */
1140 .macro over_n_8888_init
1141         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
1142         /* Hold loop invariant in MASK */
1143         ldr     MASK, =0x00800080
1144         /* Hold multiplier for destination in STRIDE_M */
1145         mov     STRIDE_M, #255
1146         sub     STRIDE_M, STRIDE_M, SRC, lsr #24
1147         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1148         uadd8   SCRATCH, MASK, MASK
1149 .endm
/* Head: load numbytes of destination into WK registers (no source or
 * mask reads — the source is the constant held in SRC). */
1151 .macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1152         pixld   , numbytes, firstreg, DST, 0
1153 .endm
/* One pixel of OVER with constant source:
 * dest := dest * (255 - src_alpha) (rounded, via mul_8888_8 using
 * STRIDE_M from init), then saturating-add the source. */
1155 .macro over_n_8888_1pixel dst
1156         mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
1157         uqadd8  WK&dst, WK&dst, SRC
1158 .endm
/* Tail: blend each loaded dest word in place, then store the batch. */
1160 .macro over_n_8888_process_tail  cond, numbytes, firstreg
1161  .set PROCESS_REG, firstreg
1162  .rept numbytes / 4
1163         over_n_8888_1pixel %(PROCESS_REG)
1164   .set PROCESS_REG, PROCESS_REG+1
1165  .endr
1166         pixst   , numbytes, firstreg, DST
1167 .endm
/* OVER, solid source, a8r8g8b8 dest.  Argument separators restored
 * after the last bpp argument and the flags expression, matching the
 * comma-terminated continuation style of the other invocations. */
1169 generate_composite_function \
1170     pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
1171     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
1172     2, /* prefetch distance */ \
1173     over_n_8888_init, \
1174     nop_macro, /* newline */ \
1175     nop_macro, /* cleanup */ \
1176     over_n_8888_process_head, \
1177     over_n_8888_process_tail
1179 /******************************************************************************/