1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /* Allow to build on targets not supporting neon, and force the object file
8 * target to avoid bumping the final binary target */
@ Constant tables for ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, one per
@ dither pattern.
@ That routine takes the address of the DITHER03 label, adds dither*16
@ (ADD r14,r14,r1, LSL #4) and loads 16 bytes from the result into D30/D31
@ with a :128 alignment hint, so the four tables are expected to be
@ consecutive 16-byte records on a 16-byte boundary.
@ The routine's own comments name the 32-bit lanes it broadcasts from the
@ loaded record: D30[0] = bias_R, D30[1] = bias_G, D31[0] = bias_B.
@ NOTE(review): no data or alignment directives are visible after any of these
@ labels in this listing (the embedded line numbers jump 14 -> 25 -> 36 ->
@ 47); confirm the table contents against the complete file.
14 YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
25 YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
36 YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
47 YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
59 @ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
60 @ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
@
@ Purpose, as described by the comments in the body: blend a top and a bottom
@ Y' row with y_yweight, blend horizontally with the per-pixel x weights,
@ fetch Cb/Cr at the matching chroma positions, form the 74*Y' +/- chroma +
@ bias sums for R, G and B, and store 16-bit pixels to rgb_row. Full blocks
@ of 16 pixels are written with one store; a final partial block is written
@ one pixel at a time.
@
@ dither is still in r1 when the dither table is selected at the top of the
@ routine. The context fields are commented as living in these registers:
63 @ uint16_t *rgb_row; /*r0*/
64 @ const uint8_t *y_row; /*r1*/
65 @ const uint8_t *u_row; /*r2*/
66 @ const uint8_t *v_row; /*r3*/
67 @ int y_yweight; /*r4*/
@ NOTE(review): listing lines 68-69 are not visible. The body uses r5 as the
@ post-index step between the top-row and bottom-row loads and r6 as the
@ remaining width ("width -= 16"); presumably those are the two missing
@ fields - confirm against the structure definition.
70 @ int source_x0_q16; /*r7*/
71 @ int source_dx_q16; /*r8*/
72 @ int source_uv_xoffs_q16; /*r9*/
@
@ Saves and restores r4-r9 and Q4-Q7; returns by popping the saved r14 into PC.
@
@ NOTE(review): this listing is incomplete (the embedded line numbers have
@ gaps). Several registers and labels are read below without any visible
@ definition: r12, Q10/Q13 as table-index inputs, D24 and D26-D29 as multiply
@ coefficients, r14 as the store step in the tail, the context fields
@ themselves (no load from ctx is visible), and the branch targets
@ s42xbily_neon_tail / s42xbily_neon_done. Do not assemble or modify this
@ text without checking it against the complete file.
74 .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
75 .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
78 ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
79 STMFD r13!,{r4-r9,r14} @ 7 words (r4-r9 and r14).
80 ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
81 VPUSH {Q4-Q7} @ 16 words.
82 ADD r14,r14,r1, LSL #4 @ Select the dither table to use (16 bytes per table)
84 @ Set up image index registers.
@ NOTE(review): r12 and D17 are consumed below but their setup is not visible
@ here; the lane comments describe the values after that missing setup.
86 VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
89 VTRN.32 D16,D17 @ Q8 = < 2| 0| 2| 0>*source_dx_q16
90 VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
92 VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16
93 VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16
94 CMP r8, #0 @ If source_dx_q16 is negative...
95 VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
96 ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block
97 VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
98 SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use)
99 VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
100 VDUP.I32 Q9, r8 @ Q9 = < 1| 1| 1| 1>*source_dx_q16
101 VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
102 VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
103 VLD1.64 {D30,D31},[r14,:128] @ Load some constants
106 @ The basic idea here is to do aligned loads of a block of data and then
107 @ index into it using VTBL to extract the data from the source X
108 @ coordinate corresponding to each destination pixel.
109 @ This is significantly less code and significantly fewer cycles than doing
110 @ a series of single-lane loads, but it means that the X step between
111 @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
112 @ that we could read 8 pixels from a single aligned 32-byte block of data.
113 @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
114 @ separated into even pixels and odd pixels to make extracting offsets and
@ NOTE(review): listing line 115, which finishes the sentence above, is not
@ visible.
116 @ We then pull out two bytes from the middle of each coordinate: the top
117 @ byte corresponds to the integer part of the X coordinate, and the bottom
118 @ byte corresponds to the weight to use for bilinear blending.
119 @ These are separated out into different registers with VTRN.
120 @ Then by subtracting the integer X coordinate of the first pixel in the
121 @ data block we loaded, we produce an index register suitable for use by
@ NOTE(review): the end of this sentence (listing line 122 onward) is not
@ visible; presumably it names VTBL, which consumes the indices below.
@ Rounding narrow by 8 keeps bits 8..23 of each 16.16 coordinate: a weight
@ byte (w) and an integer-x byte (x) per pixel.
126 VRSHRN.S32 D16,Q0, #8
127 AND r12,r12,#~15 @ Read 16-byte aligned blocks
129 ADD r12,r1, r12 @ r12 = y_row+(source_x&~15)
130 VRSHRN.S32 D17,Q1, #8
132 VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row, then step r12 by r5
133 ADD r14,r7, r8, LSL #3
134 VRSHRN.S32 D18,Q2, #8
136 VRSHRN.S32 D19,Q3, #8
137 AND r14,r14,#~15 @ Read 16-byte aligned blocks
138 VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
141 ADD r14,r1, r14 @ r14 = y_row+(source_x&~15)
144 VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
145 @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
@ NOTE(review): Q10 (block base) and Q13 (the +1 constant) are read by the
@ next instructions but are not set in any visible line.
146 VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded.
148 VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
149 VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
150 VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x
151 VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
152 VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
@ Second block of source data, for the upper half of the index vectors.
154 VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
155 VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row
157 VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
158 VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
159 VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
160 VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
@ Vertical blend. In the comments below a/b are the top-row samples at
@ source_x / source_x+1 and c/d are the bottom-row samples at the same
@ positions (Q10 = a, Q11 = b, Q12 = c, Q13 = d after the lookups above).
162 VDUP.I16 Q9, r4 @ Load the y weights.
163 VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a
165 VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b
167 VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight
169 VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight
171 VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits.
172 VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW.
173 VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8
174 VRSHRN.S16 D9, Q5, #8
175 VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8
176 VRSHRN.S16 D13,Q7, #8
177 VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8)
178 VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8)
@ Horizontal blend of the two vertically blended columns.
179 VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a
181 VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight
183 VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8
185 VRSHRN.S16 D9, Q5, #8
187 VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8)
188 @ Start extracting the chroma x coordinates, and load Cb and Cr.
@ NOTE(review): the instructions that set r12/r14 and Q10-Q13 for the chroma
@ lookups are not visible between the following lines.
189 AND r12,r12,#~15 @ Read 16-byte aligned blocks
190 VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4
193 VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
198 VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
@ Narrowing by 9 rather than 8 - presumably because the 42x chroma planes
@ have half the horizontal resolution of luma; confirm.
201 VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
202 VRSHRN.S32 D21,Q11,#9
204 VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
205 VRSHRN.S32 D23,Q13,#9
206 @ We don't actually need the x weights, but we get them for free.
208 VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
209 @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
210 VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded.
211 VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
213 VTBL.8 D19,{D8, D9, D10,D11},D23
215 VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
217 VTBL.8 D21,{D12,D13,D14,D15},D23
218 @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
219 @ We use VDUP to expand constants, because it's a permute instruction, so
220 @ it can dual issue on the A8.
@ The flags set by this SUBS are consumed by the BLT/BEQ further down; the
@ visible instructions in between are all NEON operations.
221 SUBS r6, r6, #16 @ width -= 16
@ NOTE(review): D24 and D26-D29 are used as the multiply coefficients (74,
@ 102, 25, 52 and 129 per the comments) but are not loaded in any visible
@ line, and only the low-half multiply of each Q-register pair is visible.
222 VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74
223 VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G
226 VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G
227 VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R
230 VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R
231 VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B
234 VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B
236 VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G
238 VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R
240 VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cb+bias_B
241 VQADD.S16 Q13,Q5, Q13
242 VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
244 @ Push each value to the top of its word and saturate it.
245 VQSHLU.S16 Q11,Q11,#2
246 VQSHLU.S16 Q12,Q12,#2
247 VQSHLU.S16 Q6, Q6, #2
248 VQSHLU.S16 Q7, Q7, #2
249 VQSHLU.S16 Q8, Q8, #2
250 VQSHLU.S16 Q13,Q13,#2
251 @ Merge G and B into R.
@ NOTE(review): the merge instructions themselves (listing lines 252-256)
@ are not visible.
@ Fewer than 16 pixels were left before the SUBS above: write a partial block.
257 BLT s42xbily_neon_tail
260 VST1.16 {D22,D23,D24,D25},[r0]! @ Store 16 pixels and advance rgb_row
261 BEQ s42xbily_neon_done
262 @ Advance the x coordinates.
@ NOTE(review): the coordinate update, the branch back to the top of the
@ loop and the s42xbily_neon_tail label (listing lines 263-269) are not
@ visible.
270 @ We have between 1 and 15 pixels left to write.
271 @ -r6 == the number of pixels we need to skip writing.
272 @ Adjust r0 to point to the last one we need to write, because we're going
273 @ to write them in reverse order.
274 ADD r0, r0, r6, LSL #1 @ r6 is negative here; 2 bytes per pixel
277 @ Skip past the ones we don't need to write.
@ Computed jump into the store ladder below: r6, LSL #2 turns the (negative)
@ pixel count into a byte offset, which relies on every store being exactly
@ one 4-byte instruction - do not insert or reorder instructions in the
@ ladder. Presumably ARM (not Thumb) encoding; confirm.
278 SUB PC, PC, r6, LSL #2
@ Stores run from pixel 15 down to pixel 0, stepping r0 by r14 after each one.
@ NOTE(review): r14 is not set in any visible line; writing in reverse order
@ implies a step of -2 bytes - confirm.
280 VST1.16 {D25[3]},[r0,:16],r14
281 VST1.16 {D25[2]},[r0,:16],r14
282 VST1.16 {D25[1]},[r0,:16],r14
283 VST1.16 {D25[0]},[r0,:16],r14
284 VST1.16 {D24[3]},[r0,:16],r14
285 VST1.16 {D24[2]},[r0,:16],r14
286 VST1.16 {D24[1]},[r0,:16],r14
287 VST1.16 {D24[0]},[r0,:16],r14
288 VST1.16 {D23[3]},[r0,:16],r14
289 VST1.16 {D23[2]},[r0,:16],r14
290 VST1.16 {D23[1]},[r0,:16],r14
291 VST1.16 {D23[0]},[r0,:16],r14
292 VST1.16 {D22[3]},[r0,:16],r14
293 VST1.16 {D22[2]},[r0,:16],r14
294 VST1.16 {D22[1]},[r0,:16],r14
295 VST1.16 {D22[0]},[r0,:16]
@ Epilogue: undo the prologue pushes in reverse order.
@ NOTE(review): the s42xbily_neon_done label (listing line 296) is not
@ visible; presumably it sits here.
297 VPOP {Q4-Q7} @ 16 words.
298 LDMFD r13!,{r4-r9,PC} @ 7 words; loading PC returns to the caller.
300 .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
302 #if defined(__ELF__)&&defined(__linux__)
303 .section .note.GNU-stack,"",%progbits