1 From 78652135f48c6a304fc2e75bc0e440b8b2034a4d Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3 Date: Tue, 23 Feb 2010 23:44:00 +0000
4 Subject: ARM: added 'neon_composite_over_n_8888_8888_ca' fast path
6 This fast path function improves performance of 'firefox-talos-gfx'
9 Benchmark from ARM Cortex-A8 @720MHz (first table: before this patch; second table: after)
13 [ # ] backend test min(s) median(s) stddev. count
14 [ 0] image firefox-talos-gfx 139.969 141.176 0.35% 6/6
18 [ # ] backend test min(s) median(s) stddev. count
19 [ 0] image firefox-talos-gfx 111.810 112.196 0.23% 6/6
21 diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
22 index 2986884..e90d662 100644
23 --- a/pixman/pixman-arm-neon-asm.S
24 +++ b/pixman/pixman-arm-neon-asm.S
25 @@ -1026,6 +1026,111 @@ generate_composite_function \
27 /******************************************************************************/
29 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* first half of one pixel block: scale solid src and mask, then start the dest multiplies */
31 + * 'combine_mask_ca' replacement
33 + * input: solid src (n) in {d8, d9, d10, d11}
34 + * dest in {d4, d5, d6, d7 }
35 + * mask in {d24, d25, d26, d27}
36 + * output: updated src in {d0, d1, d2, d3 }
37 + * updated mask in {d24, d25, d26, d3 }
39 + vmull.u8 q0, d24, d8 /* q0 = mask plane d24 * src plane d8, widened to 16 bit */
40 + vmull.u8 q1, d25, d9 /* q1 = d25 * d9 */
41 + vmull.u8 q6, d26, d10 /* q6 = d26 * d10 */
42 + vmull.u8 q7, d27, d11 /* q7 = last mask plane * last src plane (presumably the alpha planes) */
43 + vmull.u8 q9, d11, d25 /* q9 = d11 * mask plane d25; d11 is presumably src alpha */
44 + vmull.u8 q12, d11, d24 /* q12 = d11 * mask plane d24 */
45 + vmull.u8 q13, d11, d26 /* q13 = d11 * mask plane d26 */
46 + vrshr.u16 q8, q0, #8 /* q8 = (q0 + 128) >> 8: first step of the rounded divide by 255 */
47 + vrshr.u16 q10, q1, #8
48 + vrshr.u16 q11, q6, #8
49 + vraddhn.u16 d0, q0, q8 /* d0 = (q0 + q8 + 128) >> 8 narrowed to 8 bit, i.e. q0 / 255 rounded */
50 + vraddhn.u16 d1, q1, q10 /* d1 = q1 / 255 rounded */
51 + vraddhn.u16 d2, q6, q11 /* d2 = q6 / 255 rounded; d0-d2 are now the updated src planes */
52 + vrshr.u16 q11, q12, #8 /* q11, q8, q6 and q10 are reused as scratch for the four remaining products */
53 + vrshr.u16 q8, q9, #8
54 + vrshr.u16 q6, q13, #8
55 + vrshr.u16 q10, q7, #8
56 + vraddhn.u16 d24, q12, q11 /* d24 = (d11 * old d24) / 255: updated mask plane, overwrites the input mask */
57 + vraddhn.u16 d25, q9, q8 /* d25 = (d11 * old d25) / 255 */
58 + vraddhn.u16 d26, q13, q6 /* d26 = (d11 * old d26) / 255 */
59 + vraddhn.u16 d3, q7, q10 /* d3 = (d27 * d11) / 255: listed above as both 4th src plane and 4th mask plane */
61 + * 'combine_over_ca' replacement
63 + * output: updated dest in {d28, d29, d30, d31}
67 + vmull.u8 q8, d24, d4 /* q8 = d24 * dest plane d4, widened to 16 bit */
68 + vmull.u8 q9, d25, d5 /* q9 = d25 * dest plane d5 */
71 + vmull.u8 q10, d26, d6 /* q10 = d26 * dest plane d6 */
72 + vmull.u8 q11, d27, d7 /* q11 = d27 * dest plane d7; q8-q11 are consumed by the tail macros */
75 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail /* second half: finish the dest multiplies left in q8-q11 and add the updated src */
76 + /* ... continue 'combine_over_ca' replacement */
77 + vrshr.u16 q14, q8, #8 /* q14 = (q8 + 128) >> 8: first step of the rounded divide by 255 */
78 + vrshr.u16 q15, q9, #8
79 + vrshr.u16 q6, q10, #8
80 + vrshr.u16 q7, q11, #8
81 + vraddhn.u16 d28, q14, q8 /* d28 = (q8 + q14 + 128) >> 8 narrowed to 8 bit; overwrites the low half of scratch q14 */
82 + vraddhn.u16 d29, q15, q9 /* d29 = q9 / 255 rounded */
83 + vraddhn.u16 d30, q6, q10 /* d30 = q10 / 255 rounded */
84 + vraddhn.u16 d31, q7, q11 /* d31 = q11 / 255 rounded; d28-d31 now hold the scaled dest planes */
85 + vqadd.u8 q14, q0, q14 /* saturating byte add: {d28, d29} += updated src {d0, d1} */
86 + vqadd.u8 q15, q1, q15 /* saturating byte add: {d30, d31} += updated src {d2, d3} */
89 +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head /* same arithmetic as the tail macro, interleaved with loads for the next block, then the head macro */
90 + vrshr.u16 q14, q8, #8 /* q14 = (q8 + 128) >> 8: first step of the rounded divide by 255 */
91 + vrshr.u16 q15, q9, #8
92 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* de-interleaving load of the next dest block into 4 byte planes; DST_R is post-incremented */
93 + vrshr.u16 q6, q10, #8
94 + vrshr.u16 q7, q11, #8
95 + vraddhn.u16 d28, q14, q8 /* d28 = q8 / 255 rounded, narrowed to 8 bit */
96 + vraddhn.u16 d29, q15, q9
97 + vraddhn.u16 d30, q6, q10
98 + vraddhn.u16 d31, q7, q11
99 + vld4.8 {d24, d25, d26, d27}, [MASK]! /* de-interleaving load of the next mask block; MASK is post-incremented */
100 + vqadd.u8 q14, q0, q14 /* saturating byte add: {d28, d29} += updated src {d0, d1} */
101 + vqadd.u8 q15, q1, q15 /* saturating byte add: {d30, d31} += updated src {d2, d3} */
103 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* start the next block using the d4-d7 and d24-d27 loaded above */
104 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* interleaving store of the finished block; the head instructions shown above do not write d28-d31 */
107 +.macro pixman_composite_over_n_8888_8888_ca_init /* setup macro handed to generate_composite_function as its init argument */
108 + add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET; presumably the address of the stack-passed arguments */
110 + vld1.32 {d11[0]}, [DUMMY] /* load one 32-bit word from [DUMMY] into lane 0 of d11; NOTE(review): presumably the solid source colour - confirm */
117 +.macro pixman_composite_over_n_8888_8888_ca_cleanup
121 +generate_composite_function \
122 + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* NOTE(review): presumably src/mask/dst bits per pixel, 0 = solid src - confirm against the macro definition */ \
123 + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* NOTE(review): presumably dest is read as well as written, and 32bpp data is split into byte planes - confirm */ \
124 + 8, /* number of pixels, processed in a single block */ \
125 + 5, /* prefetch distance */ \
126 + pixman_composite_over_n_8888_8888_ca_init, \
127 + pixman_composite_over_n_8888_8888_ca_cleanup, \
128 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
129 + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
130 + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
132 +/******************************************************************************/
134 .macro pixman_composite_add_n_8_8_process_pixblock_head
135 /* expecting source data in {d8, d9, d10, d11} */
136 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
137 diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
138 index 557301e..3f0e18e 100644
139 --- a/pixman/pixman-arm-neon.c
140 +++ b/pixman/pixman-arm-neon.c
141 @@ -269,6 +269,7 @@ BIND_SRC_NULL_DST(over_8888_8888, uint32_t, 1, uint32_t, 1)
143 BIND_N_MASK_DST(over_n_8_0565, uint8_t, 1, uint16_t, 1)
144 BIND_N_MASK_DST(over_n_8_8888, uint8_t, 1, uint32_t, 1)
145 +BIND_N_MASK_DST(over_n_8888_8888_ca, uint32_t, 1, uint32_t, 1)
146 BIND_N_MASK_DST(add_n_8_8, uint8_t, 1, uint8_t, 1)
148 BIND_SRC_N_DST(over_8888_n_8888, uint32_t, 1, uint32_t, 1)
149 @@ -412,6 +413,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
150 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, neon_composite_over_n_0565),
151 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, neon_composite_over_n_8888),
152 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, neon_composite_over_n_8888),
153 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
154 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
155 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
156 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
157 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888),
158 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888),
159 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888),