1 @*****************************************************************************
2 @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @****************************************************************************/
/*
 * i420_rv16_neon: exported symbol, typed as a function for the linker.
 * Per the file header this is the NEON I420 to RV16 conversion routine.
 *
 * NOTE(review): this view of the file has gaps. The function label, the
 * definitions of the register aliases (O1, O2, Y1, Y2, U, V, WIDTH, COUNT,
 * OPITCH, YPITCH, OPAD, YPAD, u, v, y1, y2, coef*, Rc/Gc/Bc, chro_*, lumi*,
 * red*, green*, blue*, out*), the `coefficients` table and the loop labels
 * and branches are not visible. The comments below describe only the
 * instructions shown and mark every assumption about the missing parts.
 */
86 .global i420_rv16_neon
87 .type i420_rv16_neon, %function
/*
 * Save r4-r8, r10, r11 and lr. The conditional pop further down restores
 * the same list with pc in place of lr, which is how the routine returns.
 */
89 push {r4-r8,r10-r11,lr}
/*
 * Fetch the arguments from memory: two words from the address in r0 and
 * four words from the address in r1. LDM assigns words by ascending
 * register number, so which word lands in which alias depends on the alias
 * definitions (not visible here).
 * NOTE(review): presumably r0 describes the output picture (pointer, pitch)
 * and r1 the input picture (Y, U, V pointers, luma pitch) - confirm against
 * the C caller.
 */
93 ldmia r0, {O1, OPITCH}
94 ldmia r1, {Y1, U, V, YPITCH}
/*
 * WIDTH = WIDTH - OPAD, then add 16 when the flags read "not equal".
 * NOTE(review): the instruction that computes OPAD and sets the flags is
 * not visible. For this to round WIDTH up to a multiple of 16, OPAD must
 * hold WIDTH modulo 16 with Z set when that remainder is zero.
 */
96 /* round the width to be a multiple of 16 */
98 sub WIDTH, WIDTH, OPAD
99 addne WIDTH, WIDTH, #16
/*
 * OPAD is used as a scratch pointer to the `coefficients` table here; it
 * gets its final value a few lines below. Each vld1.s16 in the "all lanes"
 * form reads one 16-bit value, copies it into every lane of the two listed
 * D registers and post-increments the pointer, so three consecutive table
 * entries end up broadcast across d6/d7, d8/d9 and d10/d11.
 * NOTE(review): d8-d11 are callee-saved under the AAPCS; confirm that they
 * are saved and restored in lines not shown in this view.
 */
101 /* init constants (scale value by 64) */
107 adr OPAD, coefficients
108 vld1.s16 {d6[], d7[]}, [OPAD]!
109 vld1.s16 {d8[], d9[]}, [OPAD]!
110 vld1.s16 {d10[], d11[]}, [OPAD]!
/*
 * OPAD = OPITCH - 2 * WIDTH and YPAD = YPITCH - WIDTH, using the rounded
 * WIDTH. Presumably these are the bytes left over at the end of each output
 * row (two bytes per pixel) and each luma row - YPAD is used that way at
 * the end of this block; the use of OPAD is not visible here.
 */
114 sub OPAD, OPITCH, WIDTH, lsl #1
115 sub YPAD, YPITCH, WIDTH
/*
 * Conditional return: taken when the flags read "less than or equal".
 * NOTE(review): the instruction that sets these flags (presumably a test of
 * the remaining row count) is not visible in this view.
 */
121 /* exit if all rows have been processed */
123 pople {r4-r8,r10-r11,pc}
/*
 * Column step. Load 8 U bytes and 8 V bytes (post-incrementing both
 * pointers), then 16 luma bytes from Y1. vld2 de-interleaves the luma:
 * y1 receives the even-indexed bytes and y2 the odd-indexed bytes, so lane
 * i of u/v lines up with lane i of both y1 and y2. The 64/128 qualifiers
 * tell the CPU the addresses are 64-bit / 128-bit aligned.
 */
129 vld1.u8 {u}, [U,:64]!
130 vld1.u8 {v}, [V,:64]!
133 vld2.u8 {y1,y2}, [Y1,:128]!
/*
 * Widening unsigned 8-bit multiplies into 16-bit lanes:
 *   Q14 = v * coefRV
 *   Q13 = u * coefBU
 *   Q11 = u * coefGU + v * coefGV
 */
135 vmull.u8 Q14, v, coefRV
136 vmull.u8 Q11, u, coefGU
137 vmull.u8 Q13, u, coefBU
138 vmlal.u8 Q11, v, coefGV
/*
 * Scaled luma for the odd (lumi2) and even (lumi1) columns, then the
 * per-channel chroma terms that are shared by both luma rows:
 *   chro_r = Rc + Q14, chro_b = Bc + Q13, chro_g = Gc - Q11
 * NOTE(review): Rc/Gc/Bc presumably alias the constants broadcast from the
 * table above - confirm in the alias definitions.
 */
140 vmull.u8 lumi2, y2, coefY
141 vmull.u8 lumi1, y1, coefY
142 vadd.s16 chro_r, Rc, Q14
143 vadd.s16 chro_b, Bc, Q13
144 vsub.s16 chro_g, Gc, Q11
/* Saturating signed 16-bit adds: luma plus the matching chroma term. */
149 /* chrominance + luminance */
150 vqadd.s16 red16_2, lumi2, chro_r
151 vqadd.s16 green16_2, lumi2, chro_g
152 vqadd.s16 blue16_2, lumi2, chro_b
153 vqadd.s16 red16_1, lumi1, chro_r
154 vqadd.s16 green16_1, lumi1, chro_g
155 vqadd.s16 blue16_1, lumi1, chro_b
/*
 * Rounding shift right by 6 bits (undoes the scale by 64), saturating each
 * signed 16-bit lane to the unsigned 8-bit range while narrowing.
 */
157 /* clamp (divide by 64) */
158 vqrshrun.s16 green2, green16_2, #6
159 vqrshrun.s16 blue2, blue16_2, #6
160 vqrshrun.s16 red2, red16_2, #6
161 vqrshrun.s16 green1, green16_1, #6
162 vqrshrun.s16 red1, red16_1, #6
163 vqrshrun.s16 blue1, blue16_1, #6
/*
 * Build the two bytes of each 16-bit pixel:
 *   low byte  = (green shifted left by 3) with (blue shifted right by 3)
 *               inserted into its low 5 bits  -> steps "a" and "b" below
 *   high byte = existing top 5 bits of out*h with (green shifted right
 *               by 5) inserted into its low 3 bits
 * NOTE(review): the high byte is only correct if out1h/out2h already hold
 * the red values, i.e. presumably they alias red1/red2 - confirm in the
 * alias definitions.
 * NOTE(review): group 1 comes from the even luma columns and group 2 from
 * the odd ones; confirm that the column order is restored (for example by a
 * re-interleave in lines not visible here) before the stores below.
 */
167 /* pack into RGB565 */
168 vshl.u8 out2l, green2, #3 // low 2a
169 vsri.u8 out2h, green2, #5 // high 2
170 vshl.u8 out1l, green1, #3 // low 1a
171 vsri.u8 out1h, green1, #5 // high 1
172 vsri.u8 out2l, blue2, #3 // low 2b
173 vsri.u8 out1l, blue1, #3 // low 1b
/*
 * Second luma row: load 16 bytes from Y2 (de-interleaved like above) and
 * start its luma multiplies before storing the first row, so the stores
 * overlap with the multiplies. The chroma terms chro_r/g/b computed above
 * are reused unchanged for this row.
 * NOTE(review): Y2 and O2 are set up in lines not visible here; presumably
 * they point one row below Y1 and O1.
 */
176 vld2.u8 {y1,y2}, [Y2,:128]!
180 vmull.u8 lumi2, y2, coefY
182 vmull.u8 lumi1, y1, coefY
/*
 * Store the first row: each vst2 interleaves low and high bytes (low byte
 * first) and writes 16 bytes, i.e. eight 16-bit pixels, then advances O1.
 */
183 vst2.u8 {out1l, out1h}, [O1,:128]!
184 vst2.u8 {out2l, out2h}, [O1,:128]!
/* Same add / clamp / pack sequence as for the first row. */
186 /* chrominance + luminance */
187 vqadd.s16 green16_2, lumi2, chro_g
188 vqadd.s16 red16_2, lumi2, chro_r
189 vqadd.s16 blue16_2, lumi2, chro_b
190 vqadd.s16 red16_1, lumi1, chro_r
191 vqadd.s16 green16_1, lumi1, chro_g
192 vqadd.s16 blue16_1, lumi1, chro_b
194 /* clamp (divide by 64) */
195 vqrshrun.s16 green2, green16_2, #6
196 vqrshrun.s16 blue2, blue16_2, #6
197 vqrshrun.s16 red2, red16_2, #6
198 vqrshrun.s16 green1, green16_1, #6
199 vqrshrun.s16 red1, red16_1, #6
200 vqrshrun.s16 blue1, blue16_1, #6
204 /* pack into RGB565 */
205 vshl.u8 out2l, green2, #3 // low 2a
206 vsri.u8 out2h, green2, #5 // high 2
207 vshl.u8 out1l, green1, #3 // low 1a
208 vsri.u8 out1h, green1, #5 // high 1
209 vsri.u8 out2l, blue2, #3 // low 2b
210 vsri.u8 out1l, blue1, #3 // low 1b
/* Store the second row through O2, 2 x 16 bytes, advancing O2. */
214 vst2.u8 {out1l, out1h}, [O2,:128]!
215 vst2.u8 {out2l, out2h}, [O2,:128]!
/*
 * COUNT -= 16 and update the flags; the branch that consumes them is not
 * visible in this view.
 */
217 /* next columns (x16) */
218 subs COUNT, COUNT, #16
/*
 * Advance U and V by half of YPAD (logical shift right by one).
 * NOTE(review): this presumably skips the chroma row padding and assumes
 * the chroma pitch is half the luma pitch - confirm against the caller.
 */
225 add U, U, YPAD, lsr #1
226 add V, V, YPAD, lsr #1