2 * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
4 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
24 #define JPEG_INTERNALS
25 #include "../../jinclude.h"
26 #include "../../jpeglib.h"
27 #include "../../jsimd.h"
28 #include "../../jdct.h"
29 #include "../../jsimddct.h"
32 #include "neon-compat.h"
37 /* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
38 * (Discrete Cosine Transform) on one block of samples. It uses the same
39 * calculations and produces exactly the same output as IJG's original
40 * jpeg_fdct_islow() function, which can be found in jfdctint.c.
42 * Scaled integer constants are used to avoid floating-point arithmetic:
43 * 0.298631336 = 2446 * 2^-13
44 * 0.390180644 = 3196 * 2^-13
45 * 0.541196100 = 4433 * 2^-13
46 * 0.765366865 = 6270 * 2^-13
47 * 0.899976223 = 7373 * 2^-13
48 * 1.175875602 = 9633 * 2^-13
49 * 1.501321110 = 12299 * 2^-13
50 * 1.847759065 = 15137 * 2^-13
51 * 1.961570560 = 16069 * 2^-13
52 * 2.053119869 = 16819 * 2^-13
53 * 2.562915447 = 20995 * 2^-13
54 * 3.072711026 = 25172 * 2^-13
56 * See jfdctint.c for further details of the DCT algorithm. Where possible,
57 * the variable names and comments here in jsimd_fdct_islow_neon() match up
58 * with those in jpeg_fdct_islow().
64 #define DESCALE_P1 (CONST_BITS - PASS1_BITS)
65 #define DESCALE_P2 (CONST_BITS + PASS1_BITS)
81 ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts
[] = {
82 F_0_298
, -F_0_390
, F_0_541
, F_0_765
,
83 -F_0_899
, F_1_175
, F_1_501
, -F_1_847
,
84 -F_1_961
, F_2_053
, -F_2_562
, F_3_072
87 void jsimd_fdct_islow_neon(DCTELEM
*data
)
89 /* Load DCT constants. */
90 #ifdef HAVE_VLD1_S16_X3
91 const int16x4x3_t consts
= vld1_s16_x3(jsimd_fdct_islow_neon_consts
);
93 /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
94 const int16x4_t consts1
= vld1_s16(jsimd_fdct_islow_neon_consts
);
95 const int16x4_t consts2
= vld1_s16(jsimd_fdct_islow_neon_consts
+ 4);
96 const int16x4_t consts3
= vld1_s16(jsimd_fdct_islow_neon_consts
+ 8);
97 const int16x4x3_t consts
= { { consts1
, consts2
, consts3
} };
100 /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
101 * are used, followed by vuzp to transpose the block such that we have a
102 * column of samples per vector - allowing all rows to be processed at once.
104 int16x8x4_t s_rows_0123
= vld4q_s16(data
);
105 int16x8x4_t s_rows_4567
= vld4q_s16(data
+ 4 * DCTSIZE
);
107 int16x8x2_t cols_04
= vuzpq_s16(s_rows_0123
.val
[0], s_rows_4567
.val
[0]);
108 int16x8x2_t cols_15
= vuzpq_s16(s_rows_0123
.val
[1], s_rows_4567
.val
[1]);
109 int16x8x2_t cols_26
= vuzpq_s16(s_rows_0123
.val
[2], s_rows_4567
.val
[2]);
110 int16x8x2_t cols_37
= vuzpq_s16(s_rows_0123
.val
[3], s_rows_4567
.val
[3]);
112 int16x8_t col0
= cols_04
.val
[0];
113 int16x8_t col1
= cols_15
.val
[0];
114 int16x8_t col2
= cols_26
.val
[0];
115 int16x8_t col3
= cols_37
.val
[0];
116 int16x8_t col4
= cols_04
.val
[1];
117 int16x8_t col5
= cols_15
.val
[1];
118 int16x8_t col6
= cols_26
.val
[1];
119 int16x8_t col7
= cols_37
.val
[1];
121 /* Pass 1: process rows. */
123 int16x8_t tmp0
= vaddq_s16(col0
, col7
);
124 int16x8_t tmp7
= vsubq_s16(col0
, col7
);
125 int16x8_t tmp1
= vaddq_s16(col1
, col6
);
126 int16x8_t tmp6
= vsubq_s16(col1
, col6
);
127 int16x8_t tmp2
= vaddq_s16(col2
, col5
);
128 int16x8_t tmp5
= vsubq_s16(col2
, col5
);
129 int16x8_t tmp3
= vaddq_s16(col3
, col4
);
130 int16x8_t tmp4
= vsubq_s16(col3
, col4
);
133 int16x8_t tmp10
= vaddq_s16(tmp0
, tmp3
);
134 int16x8_t tmp13
= vsubq_s16(tmp0
, tmp3
);
135 int16x8_t tmp11
= vaddq_s16(tmp1
, tmp2
);
136 int16x8_t tmp12
= vsubq_s16(tmp1
, tmp2
);
138 col0
= vshlq_n_s16(vaddq_s16(tmp10
, tmp11
), PASS1_BITS
);
139 col4
= vshlq_n_s16(vsubq_s16(tmp10
, tmp11
), PASS1_BITS
);
141 int16x8_t tmp12_add_tmp13
= vaddq_s16(tmp12
, tmp13
);
143 vmull_lane_s16(vget_low_s16(tmp12_add_tmp13
), consts
.val
[0], 2);
145 vmull_lane_s16(vget_high_s16(tmp12_add_tmp13
), consts
.val
[0], 2);
147 int32x4_t col2_scaled_l
=
148 vmlal_lane_s16(z1_l
, vget_low_s16(tmp13
), consts
.val
[0], 3);
149 int32x4_t col2_scaled_h
=
150 vmlal_lane_s16(z1_h
, vget_high_s16(tmp13
), consts
.val
[0], 3);
151 col2
= vcombine_s16(vrshrn_n_s32(col2_scaled_l
, DESCALE_P1
),
152 vrshrn_n_s32(col2_scaled_h
, DESCALE_P1
));
154 int32x4_t col6_scaled_l
=
155 vmlal_lane_s16(z1_l
, vget_low_s16(tmp12
), consts
.val
[1], 3);
156 int32x4_t col6_scaled_h
=
157 vmlal_lane_s16(z1_h
, vget_high_s16(tmp12
), consts
.val
[1], 3);
158 col6
= vcombine_s16(vrshrn_n_s32(col6_scaled_l
, DESCALE_P1
),
159 vrshrn_n_s32(col6_scaled_h
, DESCALE_P1
));
162 int16x8_t z1
= vaddq_s16(tmp4
, tmp7
);
163 int16x8_t z2
= vaddq_s16(tmp5
, tmp6
);
164 int16x8_t z3
= vaddq_s16(tmp4
, tmp6
);
165 int16x8_t z4
= vaddq_s16(tmp5
, tmp7
);
167 int32x4_t z5_l
= vmull_lane_s16(vget_low_s16(z3
), consts
.val
[1], 1);
168 int32x4_t z5_h
= vmull_lane_s16(vget_high_s16(z3
), consts
.val
[1], 1);
169 z5_l
= vmlal_lane_s16(z5_l
, vget_low_s16(z4
), consts
.val
[1], 1);
170 z5_h
= vmlal_lane_s16(z5_h
, vget_high_s16(z4
), consts
.val
[1], 1);
172 /* sqrt(2) * (-c1+c3+c5-c7) */
173 int32x4_t tmp4_l
= vmull_lane_s16(vget_low_s16(tmp4
), consts
.val
[0], 0);
174 int32x4_t tmp4_h
= vmull_lane_s16(vget_high_s16(tmp4
), consts
.val
[0], 0);
175 /* sqrt(2) * ( c1+c3-c5+c7) */
176 int32x4_t tmp5_l
= vmull_lane_s16(vget_low_s16(tmp5
), consts
.val
[2], 1);
177 int32x4_t tmp5_h
= vmull_lane_s16(vget_high_s16(tmp5
), consts
.val
[2], 1);
178 /* sqrt(2) * ( c1+c3+c5-c7) */
179 int32x4_t tmp6_l
= vmull_lane_s16(vget_low_s16(tmp6
), consts
.val
[2], 3);
180 int32x4_t tmp6_h
= vmull_lane_s16(vget_high_s16(tmp6
), consts
.val
[2], 3);
181 /* sqrt(2) * ( c1+c3-c5-c7) */
182 int32x4_t tmp7_l
= vmull_lane_s16(vget_low_s16(tmp7
), consts
.val
[1], 2);
183 int32x4_t tmp7_h
= vmull_lane_s16(vget_high_s16(tmp7
), consts
.val
[1], 2);
185 /* sqrt(2) * (c7-c3) */
186 z1_l
= vmull_lane_s16(vget_low_s16(z1
), consts
.val
[1], 0);
187 z1_h
= vmull_lane_s16(vget_high_s16(z1
), consts
.val
[1], 0);
188 /* sqrt(2) * (-c1-c3) */
189 int32x4_t z2_l
= vmull_lane_s16(vget_low_s16(z2
), consts
.val
[2], 2);
190 int32x4_t z2_h
= vmull_lane_s16(vget_high_s16(z2
), consts
.val
[2], 2);
191 /* sqrt(2) * (-c3-c5) */
192 int32x4_t z3_l
= vmull_lane_s16(vget_low_s16(z3
), consts
.val
[2], 0);
193 int32x4_t z3_h
= vmull_lane_s16(vget_high_s16(z3
), consts
.val
[2], 0);
194 /* sqrt(2) * (c5-c3) */
195 int32x4_t z4_l
= vmull_lane_s16(vget_low_s16(z4
), consts
.val
[0], 1);
196 int32x4_t z4_h
= vmull_lane_s16(vget_high_s16(z4
), consts
.val
[0], 1);
198 z3_l
= vaddq_s32(z3_l
, z5_l
);
199 z3_h
= vaddq_s32(z3_h
, z5_h
);
200 z4_l
= vaddq_s32(z4_l
, z5_l
);
201 z4_h
= vaddq_s32(z4_h
, z5_h
);
203 tmp4_l
= vaddq_s32(tmp4_l
, z1_l
);
204 tmp4_h
= vaddq_s32(tmp4_h
, z1_h
);
205 tmp4_l
= vaddq_s32(tmp4_l
, z3_l
);
206 tmp4_h
= vaddq_s32(tmp4_h
, z3_h
);
207 col7
= vcombine_s16(vrshrn_n_s32(tmp4_l
, DESCALE_P1
),
208 vrshrn_n_s32(tmp4_h
, DESCALE_P1
));
210 tmp5_l
= vaddq_s32(tmp5_l
, z2_l
);
211 tmp5_h
= vaddq_s32(tmp5_h
, z2_h
);
212 tmp5_l
= vaddq_s32(tmp5_l
, z4_l
);
213 tmp5_h
= vaddq_s32(tmp5_h
, z4_h
);
214 col5
= vcombine_s16(vrshrn_n_s32(tmp5_l
, DESCALE_P1
),
215 vrshrn_n_s32(tmp5_h
, DESCALE_P1
));
217 tmp6_l
= vaddq_s32(tmp6_l
, z2_l
);
218 tmp6_h
= vaddq_s32(tmp6_h
, z2_h
);
219 tmp6_l
= vaddq_s32(tmp6_l
, z3_l
);
220 tmp6_h
= vaddq_s32(tmp6_h
, z3_h
);
221 col3
= vcombine_s16(vrshrn_n_s32(tmp6_l
, DESCALE_P1
),
222 vrshrn_n_s32(tmp6_h
, DESCALE_P1
));
224 tmp7_l
= vaddq_s32(tmp7_l
, z1_l
);
225 tmp7_h
= vaddq_s32(tmp7_h
, z1_h
);
226 tmp7_l
= vaddq_s32(tmp7_l
, z4_l
);
227 tmp7_h
= vaddq_s32(tmp7_h
, z4_h
);
228 col1
= vcombine_s16(vrshrn_n_s32(tmp7_l
, DESCALE_P1
),
229 vrshrn_n_s32(tmp7_h
, DESCALE_P1
));
231 /* Transpose to work on columns in pass 2. */
232 int16x8x2_t cols_01
= vtrnq_s16(col0
, col1
);
233 int16x8x2_t cols_23
= vtrnq_s16(col2
, col3
);
234 int16x8x2_t cols_45
= vtrnq_s16(col4
, col5
);
235 int16x8x2_t cols_67
= vtrnq_s16(col6
, col7
);
237 int32x4x2_t cols_0145_l
= vtrnq_s32(vreinterpretq_s32_s16(cols_01
.val
[0]),
238 vreinterpretq_s32_s16(cols_45
.val
[0]));
239 int32x4x2_t cols_0145_h
= vtrnq_s32(vreinterpretq_s32_s16(cols_01
.val
[1]),
240 vreinterpretq_s32_s16(cols_45
.val
[1]));
241 int32x4x2_t cols_2367_l
= vtrnq_s32(vreinterpretq_s32_s16(cols_23
.val
[0]),
242 vreinterpretq_s32_s16(cols_67
.val
[0]));
243 int32x4x2_t cols_2367_h
= vtrnq_s32(vreinterpretq_s32_s16(cols_23
.val
[1]),
244 vreinterpretq_s32_s16(cols_67
.val
[1]));
246 int32x4x2_t rows_04
= vzipq_s32(cols_0145_l
.val
[0], cols_2367_l
.val
[0]);
247 int32x4x2_t rows_15
= vzipq_s32(cols_0145_h
.val
[0], cols_2367_h
.val
[0]);
248 int32x4x2_t rows_26
= vzipq_s32(cols_0145_l
.val
[1], cols_2367_l
.val
[1]);
249 int32x4x2_t rows_37
= vzipq_s32(cols_0145_h
.val
[1], cols_2367_h
.val
[1]);
251 int16x8_t row0
= vreinterpretq_s16_s32(rows_04
.val
[0]);
252 int16x8_t row1
= vreinterpretq_s16_s32(rows_15
.val
[0]);
253 int16x8_t row2
= vreinterpretq_s16_s32(rows_26
.val
[0]);
254 int16x8_t row3
= vreinterpretq_s16_s32(rows_37
.val
[0]);
255 int16x8_t row4
= vreinterpretq_s16_s32(rows_04
.val
[1]);
256 int16x8_t row5
= vreinterpretq_s16_s32(rows_15
.val
[1]);
257 int16x8_t row6
= vreinterpretq_s16_s32(rows_26
.val
[1]);
258 int16x8_t row7
= vreinterpretq_s16_s32(rows_37
.val
[1]);
260 /* Pass 2: process columns. */
262 tmp0
= vaddq_s16(row0
, row7
);
263 tmp7
= vsubq_s16(row0
, row7
);
264 tmp1
= vaddq_s16(row1
, row6
);
265 tmp6
= vsubq_s16(row1
, row6
);
266 tmp2
= vaddq_s16(row2
, row5
);
267 tmp5
= vsubq_s16(row2
, row5
);
268 tmp3
= vaddq_s16(row3
, row4
);
269 tmp4
= vsubq_s16(row3
, row4
);
272 tmp10
= vaddq_s16(tmp0
, tmp3
);
273 tmp13
= vsubq_s16(tmp0
, tmp3
);
274 tmp11
= vaddq_s16(tmp1
, tmp2
);
275 tmp12
= vsubq_s16(tmp1
, tmp2
);
277 row0
= vrshrq_n_s16(vaddq_s16(tmp10
, tmp11
), PASS1_BITS
);
278 row4
= vrshrq_n_s16(vsubq_s16(tmp10
, tmp11
), PASS1_BITS
);
280 tmp12_add_tmp13
= vaddq_s16(tmp12
, tmp13
);
281 z1_l
= vmull_lane_s16(vget_low_s16(tmp12_add_tmp13
), consts
.val
[0], 2);
282 z1_h
= vmull_lane_s16(vget_high_s16(tmp12_add_tmp13
), consts
.val
[0], 2);
284 int32x4_t row2_scaled_l
=
285 vmlal_lane_s16(z1_l
, vget_low_s16(tmp13
), consts
.val
[0], 3);
286 int32x4_t row2_scaled_h
=
287 vmlal_lane_s16(z1_h
, vget_high_s16(tmp13
), consts
.val
[0], 3);
288 row2
= vcombine_s16(vrshrn_n_s32(row2_scaled_l
, DESCALE_P2
),
289 vrshrn_n_s32(row2_scaled_h
, DESCALE_P2
));
291 int32x4_t row6_scaled_l
=
292 vmlal_lane_s16(z1_l
, vget_low_s16(tmp12
), consts
.val
[1], 3);
293 int32x4_t row6_scaled_h
=
294 vmlal_lane_s16(z1_h
, vget_high_s16(tmp12
), consts
.val
[1], 3);
295 row6
= vcombine_s16(vrshrn_n_s32(row6_scaled_l
, DESCALE_P2
),
296 vrshrn_n_s32(row6_scaled_h
, DESCALE_P2
));
299 z1
= vaddq_s16(tmp4
, tmp7
);
300 z2
= vaddq_s16(tmp5
, tmp6
);
301 z3
= vaddq_s16(tmp4
, tmp6
);
302 z4
= vaddq_s16(tmp5
, tmp7
);
304 z5_l
= vmull_lane_s16(vget_low_s16(z3
), consts
.val
[1], 1);
305 z5_h
= vmull_lane_s16(vget_high_s16(z3
), consts
.val
[1], 1);
306 z5_l
= vmlal_lane_s16(z5_l
, vget_low_s16(z4
), consts
.val
[1], 1);
307 z5_h
= vmlal_lane_s16(z5_h
, vget_high_s16(z4
), consts
.val
[1], 1);
309 /* sqrt(2) * (-c1+c3+c5-c7) */
310 tmp4_l
= vmull_lane_s16(vget_low_s16(tmp4
), consts
.val
[0], 0);
311 tmp4_h
= vmull_lane_s16(vget_high_s16(tmp4
), consts
.val
[0], 0);
312 /* sqrt(2) * ( c1+c3-c5+c7) */
313 tmp5_l
= vmull_lane_s16(vget_low_s16(tmp5
), consts
.val
[2], 1);
314 tmp5_h
= vmull_lane_s16(vget_high_s16(tmp5
), consts
.val
[2], 1);
315 /* sqrt(2) * ( c1+c3+c5-c7) */
316 tmp6_l
= vmull_lane_s16(vget_low_s16(tmp6
), consts
.val
[2], 3);
317 tmp6_h
= vmull_lane_s16(vget_high_s16(tmp6
), consts
.val
[2], 3);
318 /* sqrt(2) * ( c1+c3-c5-c7) */
319 tmp7_l
= vmull_lane_s16(vget_low_s16(tmp7
), consts
.val
[1], 2);
320 tmp7_h
= vmull_lane_s16(vget_high_s16(tmp7
), consts
.val
[1], 2);
322 /* sqrt(2) * (c7-c3) */
323 z1_l
= vmull_lane_s16(vget_low_s16(z1
), consts
.val
[1], 0);
324 z1_h
= vmull_lane_s16(vget_high_s16(z1
), consts
.val
[1], 0);
325 /* sqrt(2) * (-c1-c3) */
326 z2_l
= vmull_lane_s16(vget_low_s16(z2
), consts
.val
[2], 2);
327 z2_h
= vmull_lane_s16(vget_high_s16(z2
), consts
.val
[2], 2);
328 /* sqrt(2) * (-c3-c5) */
329 z3_l
= vmull_lane_s16(vget_low_s16(z3
), consts
.val
[2], 0);
330 z3_h
= vmull_lane_s16(vget_high_s16(z3
), consts
.val
[2], 0);
331 /* sqrt(2) * (c5-c3) */
332 z4_l
= vmull_lane_s16(vget_low_s16(z4
), consts
.val
[0], 1);
333 z4_h
= vmull_lane_s16(vget_high_s16(z4
), consts
.val
[0], 1);
335 z3_l
= vaddq_s32(z3_l
, z5_l
);
336 z3_h
= vaddq_s32(z3_h
, z5_h
);
337 z4_l
= vaddq_s32(z4_l
, z5_l
);
338 z4_h
= vaddq_s32(z4_h
, z5_h
);
340 tmp4_l
= vaddq_s32(tmp4_l
, z1_l
);
341 tmp4_h
= vaddq_s32(tmp4_h
, z1_h
);
342 tmp4_l
= vaddq_s32(tmp4_l
, z3_l
);
343 tmp4_h
= vaddq_s32(tmp4_h
, z3_h
);
344 row7
= vcombine_s16(vrshrn_n_s32(tmp4_l
, DESCALE_P2
),
345 vrshrn_n_s32(tmp4_h
, DESCALE_P2
));
347 tmp5_l
= vaddq_s32(tmp5_l
, z2_l
);
348 tmp5_h
= vaddq_s32(tmp5_h
, z2_h
);
349 tmp5_l
= vaddq_s32(tmp5_l
, z4_l
);
350 tmp5_h
= vaddq_s32(tmp5_h
, z4_h
);
351 row5
= vcombine_s16(vrshrn_n_s32(tmp5_l
, DESCALE_P2
),
352 vrshrn_n_s32(tmp5_h
, DESCALE_P2
));
354 tmp6_l
= vaddq_s32(tmp6_l
, z2_l
);
355 tmp6_h
= vaddq_s32(tmp6_h
, z2_h
);
356 tmp6_l
= vaddq_s32(tmp6_l
, z3_l
);
357 tmp6_h
= vaddq_s32(tmp6_h
, z3_h
);
358 row3
= vcombine_s16(vrshrn_n_s32(tmp6_l
, DESCALE_P2
),
359 vrshrn_n_s32(tmp6_h
, DESCALE_P2
));
361 tmp7_l
= vaddq_s32(tmp7_l
, z1_l
);
362 tmp7_h
= vaddq_s32(tmp7_h
, z1_h
);
363 tmp7_l
= vaddq_s32(tmp7_l
, z4_l
);
364 tmp7_h
= vaddq_s32(tmp7_h
, z4_h
);
365 row1
= vcombine_s16(vrshrn_n_s32(tmp7_l
, DESCALE_P2
),
366 vrshrn_n_s32(tmp7_h
, DESCALE_P2
));
368 vst1q_s16(data
+ 0 * DCTSIZE
, row0
);
369 vst1q_s16(data
+ 1 * DCTSIZE
, row1
);
370 vst1q_s16(data
+ 2 * DCTSIZE
, row2
);
371 vst1q_s16(data
+ 3 * DCTSIZE
, row3
);
372 vst1q_s16(data
+ 4 * DCTSIZE
, row4
);
373 vst1q_s16(data
+ 5 * DCTSIZE
, row5
);
374 vst1q_s16(data
+ 6 * DCTSIZE
, row6
);
375 vst1q_s16(data
+ 7 * DCTSIZE
, row7
);