1 @*****************************************************************************
2 @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @*****************************************************************************
25 #if HAVE_AS_FPU_DIRECTIVE
90 function i420_rv16_neon
91 push {r4-r8,r10-r11,lr}
95 ldmia r0, {O1, OPITCH}
96 ldmia r1, {Y1, U, V, YPITCH}
98 /* round the width to be a multiple of 16 */
100 sub WIDTH, WIDTH, OPAD
102 addne WIDTH, WIDTH, #16
104 /* init constants (values are scaled by 64) */
110 adr OPAD, coefficients
111 vld1.s16 {d6[], d7[]}, [OPAD]!
112 vld1.s16 {d8[], d9[]}, [OPAD]!
113 vld1.s16 {d10[], d11[]}, [OPAD]!
117 sub OPAD, OPITCH, WIDTH, lsl #1
118 sub YPAD, YPITCH, WIDTH
125 /* exit if all rows have been processed */
128 pople {r4-r8,r10-r11,pc}
134 vld1.u8 {u}, [U,:64]!
135 vld1.u8 {v}, [V,:64]!
138 vld2.u8 {y1,y2}, [Y1,:128]!
140 vmull.u8 Q14, v, coefRV
141 vmull.u8 Q11, u, coefGU
142 vmull.u8 Q13, u, coefBU
143 vmlal.u8 Q11, v, coefGV
145 vmull.u8 lumi2, y2, coefY
146 vmull.u8 lumi1, y1, coefY
147 vadd.s16 chro_r, Rc, Q14
148 vadd.s16 chro_b, Bc, Q13
149 vsub.s16 chro_g, Gc, Q11
154 /* chrominance + luminance */
155 vqadd.s16 red16_2, lumi2, chro_r
156 vqadd.s16 green16_2, lumi2, chro_g
157 vqadd.s16 blue16_2, lumi2, chro_b
158 vqadd.s16 red16_1, lumi1, chro_r
159 vqadd.s16 green16_1, lumi1, chro_g
160 vqadd.s16 blue16_1, lumi1, chro_b
162 /* clamp (divide by 64) */
163 vqrshrun.s16 green2, green16_2, #6
164 vqrshrun.s16 blue2, blue16_2, #6
165 vqrshrun.s16 red2, red16_2, #6
166 vqrshrun.s16 green1, green16_1, #6
167 vqrshrun.s16 red1, red16_1, #6
168 vqrshrun.s16 blue1, blue16_1, #6
172 /* pack into RGB565 */
173 vshl.u8 out2l, green2, #3 // low 2a
174 vsri.u8 out2h, green2, #5 // high 2
175 vshl.u8 out1l, green1, #3 // low 1a
176 vsri.u8 out1h, green1, #5 // high 1
177 vsri.u8 out2l, blue2, #3 // low 2b
178 vsri.u8 out1l, blue1, #3 // low 1b
181 vld2.u8 {y1,y2}, [Y2,:128]!
185 vmull.u8 lumi2, y2, coefY
187 vmull.u8 lumi1, y1, coefY
188 vst2.u8 {out1l, out1h}, [O1,:128]!
189 vst2.u8 {out2l, out2h}, [O1,:128]!
191 /* chrominance + luminance */
192 vqadd.s16 green16_2, lumi2, chro_g
193 vqadd.s16 red16_2, lumi2, chro_r
194 vqadd.s16 blue16_2, lumi2, chro_b
195 vqadd.s16 red16_1, lumi1, chro_r
196 vqadd.s16 green16_1, lumi1, chro_g
197 vqadd.s16 blue16_1, lumi1, chro_b
199 /* clamp (divide by 64) */
200 vqrshrun.s16 green2, green16_2, #6
201 vqrshrun.s16 blue2, blue16_2, #6
202 vqrshrun.s16 red2, red16_2, #6
203 vqrshrun.s16 green1, green16_1, #6
204 vqrshrun.s16 red1, red16_1, #6
205 vqrshrun.s16 blue1, blue16_1, #6
209 /* pack into RGB565 */
210 vshl.u8 out2l, green2, #3 // low 2a
211 vsri.u8 out2h, green2, #5 // high 2
212 vshl.u8 out1l, green1, #3 // low 1a
213 vsri.u8 out1h, green1, #5 // high 1
214 vsri.u8 out2l, blue2, #3 // low 2b
215 vsri.u8 out1l, blue1, #3 // low 1b
219 vst2.u8 {out1l, out1h}, [O2,:128]!
220 vst2.u8 {out2l, out2h}, [O2,:128]!
222 /* next columns (x16) */
223 subs COUNT, COUNT, #16
230 add U, U, YPAD, lsr #1
231 add V, V, YPAD, lsr #1