1 @*****************************************************************************
2 @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @*****************************************************************************
25 #if HAVE_AS_FPU_DIRECTIVE
87 function i420_rgb_neon
88 push {r4-r8,r10-r11,lr}
92 ldmia r0, {O1, OPITCH}
93 ldmia r1, {Y1, U, V, YPITCH}
95 /* round the width to be a multiple of 16 */
97 sub WIDTH, WIDTH, OPAD
99 addne WIDTH, WIDTH, #16
101 /* init constants (scale value by 64) */
107 adr OPAD, coefficients
108 vld1.s16 {d6[], d7[]}, [OPAD]!
109 vld1.s16 {d8[], d9[]}, [OPAD]!
110 vld1.s16 {d10[], d11[]}, [OPAD]!
115 sub OPAD, OPITCH, WIDTH, lsl #2
116 sub YPAD, YPITCH, WIDTH
123 /* exit if all rows have been processed */
126 pople {r4-r8,r10-r11,pc}
132 vld1.u8 {u}, [U,:64]!
133 vld1.u8 {v}, [V,:64]!
136 vld2.u8 {y1,y2}, [Y1,:128]!
138 vmull.u8 Q14, v, coefRV
139 vmull.u8 Q11, u, coefGU
140 vmull.u8 Q13, u, coefBU
141 vmlal.u8 Q11, v, coefGV
143 vmull.u8 lumi2, y2, coefY
144 vmull.u8 lumi1, y1, coefY
145 vadd.s16 chro_r, Rc, Q14
146 vadd.s16 chro_b, Bc, Q13
147 vsub.s16 chro_g, Gc, Q11
152 /* chrominance + luminance */
153 vqadd.s16 red16_2, lumi2, chro_r
154 vqadd.s16 blue16_2, lumi2, chro_b
155 vqadd.s16 green16_2, lumi2, chro_g
156 vqadd.s16 red16_1, lumi1, chro_r
157 vqadd.s16 green16_1, lumi1, chro_g
158 vqadd.s16 blue16_1, lumi1, chro_b
160 /* clamp (divide by 64) */
161 vqrshrun.s16 blue2, blue16_2, #6
162 vqrshrun.s16 red2, red16_2, #6
163 vqrshrun.s16 green2, green16_2, #6
164 vqrshrun.s16 red1, red16_1, #6
165 vqrshrun.s16 green1, green16_1, #6
166 vqrshrun.s16 blue1, blue16_1, #6
171 vld2.u8 {y1,y2}, [Y2,:128]!
175 vzip.u8 green1, green2
178 vmull.u8 lumi2, y2, coefY
179 vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
180 vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
182 /* chrominance + luminance */
183 vmull.u8 lumi1, y1, coefY
184 vqadd.s16 red16_2, lumi2, chro_r
185 vqadd.s16 green16_2, lumi2, chro_g
186 vqadd.s16 blue16_2, lumi2, chro_b
187 vqadd.s16 red16_1, lumi1, chro_r
188 vqadd.s16 green16_1, lumi1, chro_g
189 vqadd.s16 blue16_1, lumi1, chro_b
191 /* clamp (divide by 64) */
192 vqrshrun.s16 blue2, blue16_2, #6
193 vqrshrun.s16 red2, red16_2, #6
194 vqrshrun.s16 green2, green16_2, #6
195 vqrshrun.s16 red1, red16_1, #6
196 vqrshrun.s16 green1, green16_1, #6
197 vqrshrun.s16 blue1, blue16_1, #6
203 vzip.u8 green1, green2
206 vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
207 vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
209 /* next columns (x16) */
210 subs COUNT, COUNT, #16
217 add U, U, YPAD, lsr #1
218 add V, V, YPAD, lsr #1