1 @*****************************************************************************
2 @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @****************************************************************************/
84 .type i420_rgb_neon, %function
86 push {r4-r8,r10-r11,lr}
90 ldmia r0, {O1, OPITCH}
91 ldmia r1, {Y1, U, V, YPITCH}
93 /* round the width up to the next multiple of 16 */
95 sub WIDTH, WIDTH, OPAD
96 addne WIDTH, WIDTH, #16
98 /* init constants (values are pre-scaled by 64) */
104 adr OPAD, coefficients
105 vld1.s16 {d6[], d7[]}, [OPAD]!
106 vld1.s16 {d8[], d9[]}, [OPAD]!
107 vld1.s16 {d10[], d11[]}, [OPAD]!
112 sub OPAD, OPITCH, WIDTH, lsl #2
113 sub YPAD, YPITCH, WIDTH
119 /* exit if all rows have been processed */
121 pople {r4-r8,r10-r11,pc}
127 vld1.u8 {u}, [U,:64]!
128 vld1.u8 {v}, [V,:64]!
131 vld2.u8 {y1,y2}, [Y1,:128]!
133 vmull.u8 Q14, v, coefRV
134 vmull.u8 Q11, u, coefGU
135 vmull.u8 Q13, u, coefBU
136 vmlal.u8 Q11, v, coefGV
138 vmull.u8 lumi2, y2, coefY
139 vmull.u8 lumi1, y1, coefY
140 vadd.s16 chro_r, Rc, Q14
141 vadd.s16 chro_b, Bc, Q13
142 vsub.s16 chro_g, Gc, Q11
147 /* chrominance + luminance */
148 vqadd.s16 red16_2, lumi2, chro_r
149 vqadd.s16 blue16_2, lumi2, chro_b
150 vqadd.s16 green16_2, lumi2, chro_g
151 vqadd.s16 red16_1, lumi1, chro_r
152 vqadd.s16 green16_1, lumi1, chro_g
153 vqadd.s16 blue16_1, lumi1, chro_b
155 /* divide by 64 (with rounding) and clamp to unsigned 8 bits */
156 vqrshrun.s16 blue2, blue16_2, #6
157 vqrshrun.s16 red2, red16_2, #6
158 vqrshrun.s16 green2, green16_2, #6
159 vqrshrun.s16 red1, red16_1, #6
160 vqrshrun.s16 green1, green16_1, #6
161 vqrshrun.s16 blue1, blue16_1, #6
166 vld2.u8 {y1,y2}, [Y2,:128]!
170 vzip.u8 green1, green2
173 vmull.u8 lumi2, y2, coefY
174 vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
175 vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
177 /* chrominance + luminance */
178 vmull.u8 lumi1, y1, coefY
179 vqadd.s16 red16_2, lumi2, chro_r
180 vqadd.s16 green16_2, lumi2, chro_g
181 vqadd.s16 blue16_2, lumi2, chro_b
182 vqadd.s16 red16_1, lumi1, chro_r
183 vqadd.s16 green16_1, lumi1, chro_g
184 vqadd.s16 blue16_1, lumi1, chro_b
186 /* divide by 64 (with rounding) and clamp to unsigned 8 bits */
187 vqrshrun.s16 blue2, blue16_2, #6
188 vqrshrun.s16 red2, red16_2, #6
189 vqrshrun.s16 green2, green16_2, #6
190 vqrshrun.s16 red1, red16_1, #6
191 vqrshrun.s16 green1, green16_1, #6
192 vqrshrun.s16 blue1, blue16_1, #6
198 vzip.u8 green1, green2
201 vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
202 vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
204 /* next columns (x16) */
205 subs COUNT, COUNT, #16
212 add U, U, YPAD, lsr #1
213 add V, V, YPAD, lsr #1