CLOSED TREE: TraceMonkey merge head. (a=blockers)
[mozilla-central.git] / gfx / ycbcr / yuv_convert_arm.cpp
blob64e7baa5a96ca16e0731fe50777cc13e7cd6ca06
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
7 #include "yuv_convert.h"
9 void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag)
11 static __attribute__((aligned(16))) uint16 acc_r[8] = {
12 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
14 static __attribute__((aligned(16))) uint16 acc_g[8] = {
15 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
17 static __attribute__((aligned(16))) uint16 acc_b[8] = {
18 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
21 * Registers:
22 * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data
23 * q2 : d4, d5 - are used for storing converted RGB data
24 * q3 : d6, d7 - are used for temporary storage
26 * q4, q5 : d8, d9, d10, d11 - saturation constants used for clipping Y/U/V; q6, q7 - unused
28 * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data
29 * q10 : d20, d21 - accumulator for red
30 * q11 : d22, d23 - accumulator for green
31 * q12 : d24, d25 - accumulator for blue
32 * q13 : d26, d27
33 * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154)
35 asm volatile (
36 ".fpu neon\n"
37 ".macro convert_macroblock size\n"
38 /* load up to 16 source pixels */
39 ".if \\size == 16\n"
40 "pld [%[y], #64]\n"
41 "pld [%[u], #64]\n"
42 "pld [%[v], #64]\n"
43 "vld1.8 {d1}, [%[y]]!\n"
44 "vld1.8 {d3}, [%[y]]!\n"
45 "vld1.8 {d0}, [%[u]]!\n"
46 "vld1.8 {d2}, [%[v]]!\n"
47 ".elseif \\size == 8\n"
48 "vld1.8 {d1}, [%[y]]!\n"
49 "vld1.8 {d0[0]}, [%[u]]!\n"
50 "vld1.8 {d0[1]}, [%[u]]!\n"
51 "vld1.8 {d0[2]}, [%[u]]!\n"
52 "vld1.8 {d0[3]}, [%[u]]!\n"
53 "vld1.8 {d2[0]}, [%[v]]!\n"
54 "vld1.8 {d2[1]}, [%[v]]!\n"
55 "vld1.8 {d2[2]}, [%[v]]!\n"
56 "vld1.8 {d2[3]}, [%[v]]!\n"
57 ".elseif \\size == 4\n"
58 "vld1.8 {d1[0]}, [%[y]]!\n"
59 "vld1.8 {d1[1]}, [%[y]]!\n"
60 "vld1.8 {d1[2]}, [%[y]]!\n"
61 "vld1.8 {d1[3]}, [%[y]]!\n"
62 "vld1.8 {d0[0]}, [%[u]]!\n"
63 "vld1.8 {d0[1]}, [%[u]]!\n"
64 "vld1.8 {d2[0]}, [%[v]]!\n"
65 "vld1.8 {d2[1]}, [%[v]]!\n"
66 ".elseif \\size == 2\n"
67 "vld1.8 {d1[0]}, [%[y]]!\n"
68 "vld1.8 {d1[1]}, [%[y]]!\n"
69 "vld1.8 {d0[0]}, [%[u]]!\n"
70 "vld1.8 {d2[0]}, [%[v]]!\n"
71 ".elseif \\size == 1\n"
72 "vld1.8 {d1[0]}, [%[y]]!\n"
73 "vld1.8 {d0[0]}, [%[u]]!\n"
74 "vld1.8 {d2[0]}, [%[v]]!\n"
75 ".else\n"
76 ".error \"unsupported macroblock size\"\n"
77 ".endif\n"
79 /* d1 - Y data (first 8 bytes) */
80 /* d3 - Y data (next 8 bytes) */
81 /* d0 - U data, d2 - V data */
83 /* split even and odd Y color components */
84 "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */
85 /* clip upper and lower boundaries */
86 "vqadd.u8 q0, q0, q4\n"
87 "vqadd.u8 q1, q1, q4\n"
88 "vqsub.u8 q0, q0, q5\n"
89 "vqsub.u8 q1, q1, q5\n"
91 "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */
93 "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */
94 "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */
96 "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
97 "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */
98 "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */
99 "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
100 "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */
101 "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */
102 "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
103 "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */
104 "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */
106 "vhsub.s16 q3, q8, q10\n" /* calculate even red components */
107 "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */
108 "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */
109 "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */
111 "vhadd.s16 q3, q8, q11\n" /* calculate even green components */
112 "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */
113 "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */
114 "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */
116 "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */
117 "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */
118 "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */
119 "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */
121 "vzip.8 d0, d3\n" /* join even and odd red components */
122 "vzip.8 d1, d4\n" /* join even and odd green components */
123 "vzip.8 d2, d5\n" /* join even and odd blue components */
125 "vshll.u8 q3, d0, #8\n\t"
126 "vshll.u8 q8, d1, #8\n\t"
127 "vshll.u8 q9, d2, #8\n\t"
128 "vsri.u16 q3, q8, #5\t\n"
129 "vsri.u16 q3, q9, #11\t\n"
130 /* store pixel data to memory */
131 ".if \\size == 16\n"
132 " vst1.16 {d6, d7}, [%[dst]]!\n"
133 " vshll.u8 q3, d3, #8\n\t"
134 " vshll.u8 q8, d4, #8\n\t"
135 " vshll.u8 q9, d5, #8\n\t"
136 " vsri.u16 q3, q8, #5\t\n"
137 " vsri.u16 q3, q9, #11\t\n"
138 " vst1.16 {d6, d7}, [%[dst]]!\n"
139 ".elseif \\size == 8\n"
140 " vst1.16 {d6, d7}, [%[dst]]!\n"
141 ".elseif \\size == 4\n"
142 " vst1.16 {d6}, [%[dst]]!\n"
143 ".elseif \\size == 2\n"
144 " vst1.16 {d6[0]}, [%[dst]]!\n"
145 " vst1.16 {d6[1]}, [%[dst]]!\n"
146 ".elseif \\size == 1\n"
147 " vst1.16 {d6[0]}, [%[dst]]!\n"
148 ".endif\n"
149 ".endm\n"
151 "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */
152 "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */
153 "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */
154 "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */
156 "vmov.u8 d26, #16\n"
157 "vmov.u8 d27, #149\n"
158 "vmov.u8 d28, #204\n"
159 "vmov.u8 d29, #50\n"
160 "vmov.u8 d30, #104\n"
161 "vmov.u8 d31, #154\n"
163 "cmp %[oddflag], #0\n"
164 "beq 1f\n"
165 "convert_macroblock 1\n"
166 "sub %[n], %[n], #1\n"
167 "1:\n"
168 "subs %[n], %[n], #16\n"
169 "blt 2f\n"
170 "1:\n"
171 "convert_macroblock 16\n"
172 "subs %[n], %[n], #16\n"
173 "bge 1b\n"
174 "2:\n"
175 "tst %[n], #8\n"
176 "beq 3f\n"
177 "convert_macroblock 8\n"
178 "3:\n"
179 "tst %[n], #4\n"
180 "beq 4f\n"
181 "convert_macroblock 4\n"
182 "4:\n"
183 "tst %[n], #2\n"
184 "beq 5f\n"
185 "convert_macroblock 2\n"
186 "5:\n"
187 "tst %[n], #1\n"
188 "beq 6f\n"
189 "convert_macroblock 1\n"
190 "6:\n"
191 ".purgem convert_macroblock\n"
192 : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
193 : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
194 [oddflag] "r" (oddflag)
195 : "cc", "memory",
196 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
197 "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */
198 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
199 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"