1 From 98d08b37f17a3379d0ceff8bb7de8f943873fbd8 Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3 Date: Fri, 26 Nov 2010 08:55:49 +0200
4 Subject: [PATCH 05/24] ARM: added 'neon_composite_over_n_8_8' fast path
7 pixman/pixman-arm-neon-asm.S | 68 ++++++++++++++++++++++++++++++++++++++++++
8 pixman/pixman-arm-neon.c | 3 ++
9 2 files changed, 71 insertions(+), 0 deletions(-)
11 diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
12 index 91ec27d..a3875ee 100644
13 --- a/pixman/pixman-arm-neon-asm.S
14 +++ b/pixman/pixman-arm-neon-asm.S
15 @@ -1203,6 +1203,74 @@ generate_composite_function \
17 /******************************************************************************/
19 +.macro pixman_composite_over_n_8_8_process_pixblock_head /* on entry: d24-d27 = mask bytes and d4-d7 = dst bytes (tail_head loads both before invoking this macro), d8 = value loaded by the init macro */
20 + vmull.u8 q0, d24, d8 /* q0, q1, q6, q7 = d24..d27 * d8, widening u8 * u8 into u16 lanes */
21 + vmull.u8 q1, d25, d8
22 + vmull.u8 q6, d26, d8
23 + vmull.u8 q7, d27, d8
24 + vrshr.u16 q10, q0, #8 /* q10-q13 = (x + 128) >> 8 for each product x; q12/q13 alias d24-d27, so the mask bytes are overwritten from here on */
25 + vrshr.u16 q11, q1, #8
26 + vrshr.u16 q12, q6, #8
27 + vrshr.u16 q13, q7, #8
28 + vraddhn.u16 d0, q0, q10 /* d0-d3 = (x + ((x + 128) >> 8) + 128) >> 8, i.e. the rounded x / 255, narrowed back to u8 */
29 + vraddhn.u16 d1, q1, q11
30 + vraddhn.u16 d2, q6, q12
31 + vraddhn.u16 d3, q7, q13
34 + vmull.u8 q8, d24, d4 /* q8-q11 = d24..d27 * d4..d7 (u16 lanes), consumed by the tail macro. NOTE(review): the embedded numbering jumps from 31 to 34 here and d24-d27 hold scratch at this point in the listing; presumably the two missing lines reload q12/q13 - confirm against the upstream patch */
35 + vmull.u8 q9, d25, d5
36 + vmull.u8 q10, d26, d6
37 + vmull.u8 q11, d27, d7
40 +.macro pixman_composite_over_n_8_8_process_pixblock_tail /* finishes the block started by the head macro: reads q8-q11 and d0-d3, leaves the result in d28-d31 */
41 + vrshr.u16 q14, q8, #8 /* q14, q15, q12, q13 = (x + 128) >> 8 for the u16 products x in q8..q11 (q12/q13 alias d24-d27) */
42 + vrshr.u16 q15, q9, #8
43 + vrshr.u16 q12, q10, #8
44 + vrshr.u16 q13, q11, #8
45 + vraddhn.u16 d28, q14, q8 /* d28-d31 = (x + ((x + 128) >> 8) + 128) >> 8, i.e. the rounded x / 255, narrowed to u8 */
46 + vraddhn.u16 d29, q15, q9
47 + vraddhn.u16 d30, q12, q10
48 + vraddhn.u16 d31, q13, q11
49 + vqadd.u8 q14, q0, q14 /* saturating u8 add of the head's d0-d3 (q0, q1) into d28-d31 (q14, q15) */
50 + vqadd.u8 q15, q1, q15
53 +/* TODO: expand macros and do better instruction scheduling */
54 +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head /* runs the tail for the current block, then reloads the inputs and runs the head for the next one */
55 + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! /* d4-d7 = next 32 dst bytes (128-bit alignment hint, post-increment); the tail below does not read d4-d7 */
56 + pixman_composite_over_n_8_8_process_pixblock_tail
57 + vld1.8 {d24, d25, d26, d27}, [MASK]! /* d24-d27 = next 32 mask bytes; placed after the tail because the tail uses q12/q13 (= d24-d27) as scratch */
58 + cache_preload 32, 32 /* helper macro defined outside this hunk; presumably issues prefetches - confirm */
59 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store the 32 result bytes the tail left in d28-d31 */
60 + pixman_composite_over_n_8_8_process_pixblock_head
63 +.macro pixman_composite_over_n_8_8_init /* passed as the init hook to generate_composite_function below */
64 + add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET */
66 + vld1.32 {d8[0]}, [DUMMY] /* load one 32-bit word from that address into lane 0 of d8; presumably the solid source colour - confirm. NOTE(review): the embedded numbering skips lines 65 and 67 around this load - check the upstream patch for instructions missing from this listing */
70 +.macro pixman_composite_over_n_8_8_cleanup /* passed as the cleanup hook to generate_composite_function below. NOTE(review): no body or .endm is visible in this listing (numbering skips 71-73) - confirm against the upstream patch */
74 +generate_composite_function \
75 + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, /* name of the generated routine; the numbers are presumably src/mask/dst bits per pixel (0 = solid source) - confirm against the macro definition */ \
76 + FLAG_DST_READWRITE, /* the pixblock macros both load the destination (DST_R) and store it (DST_W) */ \
77 + 32, /* number of pixels, processed in a single block */ \
78 + 5, /* prefetch distance */ \
79 + pixman_composite_over_n_8_8_init, \
80 + pixman_composite_over_n_8_8_cleanup, \
81 + pixman_composite_over_n_8_8_process_pixblock_head, \
82 + pixman_composite_over_n_8_8_process_pixblock_tail, \
83 + pixman_composite_over_n_8_8_process_pixblock_tail_head
85 +/******************************************************************************/
87 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
89 * 'combine_mask_ca' replacement
90 diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
91 index 2f82069..72ef75e 100644
92 --- a/pixman/pixman-arm-neon.c
93 +++ b/pixman/pixman-arm-neon.c
94 @@ -76,6 +76,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888,
95 uint8_t, 1, uint32_t, 1)
96 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca,
97 uint32_t, 1, uint32_t, 1)
98 +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8,
99 + uint8_t, 1, uint8_t, 1) /* presumably defines neon_composite_over_n_8_8 (the name added to arm_neon_fast_paths[] in this patch) as a wrapper around the assembly routine, with uint8_t mask and dst elements - confirm against the macro definition */
100 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
101 uint8_t, 1, uint8_t, 1)
103 @@ -235,6 +237,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
104 PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev),
105 PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev),
106 PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888),
107 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
108 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
109 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
110 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888),