1 From 1fba7790367d7b726d05a33bbbcebe10b9280a31 Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3 Date: Mon, 29 Nov 2010 02:10:22 +0200
4 Subject: [PATCH 12/24] ARM: better NEON instructions scheduling for add_8888_8888_8888
6 Provides a minor performance improvement by pipelining the code and
7 hiding instruction latencies. Also avoids clobbering the d0-d3 registers
8 (source image pixels) during the calculations, so that the same macro
9 can later be reused for the add_n_8_8888 fast path.
11 Benchmark from ARM Cortex-A8 @500MHz:
15 add_8888_8888_8888 = L1: 95.94 L2: 42.27 M: 25.60 (121.09%)
16 HT: 14.54 VT: 13.13 R: 12.77 RT: 4.49 (48Kops/s)
17 add_8888_8_8888 = L1: 104.51 L2: 57.81 M: 36.06 (106.62%)
18 HT: 19.24 VT: 16.45 R: 14.71 RT: 4.80 (51Kops/s)
22 add_8888_8888_8888 = L1: 106.66 L2: 47.82 M: 27.32 (129.30%)
23 HT: 15.44 VT: 13.96 R: 12.86 RT: 4.48 (48Kops/s)
24 add_8888_8_8888 = L1: 107.72 L2: 61.02 M: 38.26 (113.16%)
25 HT: 19.48 VT: 16.72 R: 14.82 RT: 4.80 (51Kops/s)
27 pixman/pixman-arm-neon-asm.S | 52 +++++++++++++++++++++++++++--------------
28 1 files changed, 34 insertions(+), 18 deletions(-)
30 diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
31 index 11ef166..829ef84 100644
32 --- a/pixman/pixman-arm-neon-asm.S
33 +++ b/pixman/pixman-arm-neon-asm.S
34 @@ -1542,34 +1542,50 @@ generate_composite_function \
35 /* expecting source data in {d0, d1, d2, d3} */
36 /* destination data in {d4, d5, d6, d7} */
37 /* mask in {d24, d25, d26, d27} */
38 - vmull.u8 q8, d27, d0
39 - vmull.u8 q9, d27, d1
40 + vmull.u8 q8, d27, d0
41 + vmull.u8 q9, d27, d1
44 - vrshr.u16 q0, q8, #8
45 - vrshr.u16 q1, q9, #8
46 - vrshr.u16 q12, q10, #8
47 - vrshr.u16 q13, q11, #8
48 - vraddhn.u16 d0, q0, q8
49 - vraddhn.u16 d1, q1, q9
50 - vraddhn.u16 d2, q12, q10
51 - vraddhn.u16 d3, q13, q11
52 - vqadd.u8 q14, q0, q2
53 - vqadd.u8 q15, q1, q3
54 + /* 1 cycle bubble */
55 + vrsra.u16 q8, q8, #8
56 + vrsra.u16 q9, q9, #8
57 + vrsra.u16 q10, q10, #8
58 + vrsra.u16 q11, q11, #8
61 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
62 + /* 2 cycle bubble */
63 + vrshrn.u16 d28, q8, #8
64 + vrshrn.u16 d29, q9, #8
65 + vrshrn.u16 d30, q10, #8
66 + vrshrn.u16 d31, q11, #8
67 + vqadd.u8 q14, q2, q14
68 + /* 1 cycle bubble */
69 + vqadd.u8 q15, q3, q15
72 -/* TODO: expand macros and do better instructions scheduling */
73 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
74 - pixman_composite_add_8888_8888_8888_process_pixblock_tail
75 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
76 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
79 + vrshrn.u16 d28, q8, #8
81 + vrshrn.u16 d29, q9, #8
82 + vmull.u8 q8, d27, d0
83 + vrshrn.u16 d30, q10, #8
84 + vmull.u8 q9, d27, d1
85 + vrshrn.u16 d31, q11, #8
86 + vmull.u8 q10, d27, d2
87 + vqadd.u8 q14, q2, q14
88 + vmull.u8 q11, d27, d3
89 + vqadd.u8 q15, q3, q15
90 + vrsra.u16 q8, q8, #8
91 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
92 + vrsra.u16 q9, q9, #8
93 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
94 + vrsra.u16 q10, q10, #8
97 - pixman_composite_add_8888_8888_8888_process_pixblock_head
99 + vrsra.u16 q11, q11, #8
102 generate_composite_function \