media/libjpeg/simd/arm/jfdctint-neon.c

   1 /*
   2  * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
   3  *
   4  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
   5  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
   6  *
   7  * This software is provided 'as-is', without any express or implied
   8  * warranty.  In no event will the authors be held liable for any damages
   9  * arising from the use of this software.
  10  *
  11  * Permission is granted to anyone to use this software for any purpose,
  12  * including commercial applications, and to alter it and redistribute it
  13  * freely, subject to the following restrictions:
  14  *
  15  * 1. The origin of this software must not be misrepresented; you must not
  16  *    claim that you wrote the original software. If you use this software
  17  *    in a product, an acknowledgment in the product documentation would be
  18  *    appreciated but is not required.
  19  * 2. Altered source versions must be plainly marked as such, and must not be
  20  *    misrepresented as being the original software.
  21  * 3. This notice may not be removed or altered from any source distribution.
  22  */
  23
  24 #define JPEG_INTERNALS
  25 #include "../../jinclude.h"
  26 #include "../../jpeglib.h"
  27 #include "../../jsimd.h"
  28 #include "../../jdct.h"
  29 #include "../../jsimddct.h"
  30 #include "../jsimd.h"
  31 #include "align.h"
  32 #include "neon-compat.h"
  33
  34 #include <arm_neon.h>
  35
  36
  37 /* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
  38  * (Discrete Cosine Transform) on one block of samples.  It uses the same
  39  * calculations and produces exactly the same output as IJG's original
  40  * jpeg_fdct_islow() function, which can be found in jfdctint.c.
  41  *
  42  * Scaled integer constants are used to avoid floating-point arithmetic:
  43  *    0.298631336 =  2446 * 2^-13
  44  *    0.390180644 =  3196 * 2^-13
  45  *    0.541196100 =  4433 * 2^-13
  46  *    0.765366865 =  6270 * 2^-13
  47  *    0.899976223 =  7373 * 2^-13
  48  *    1.175875602 =  9633 * 2^-13
  49  *    1.501321110 = 12299 * 2^-13
  50  *    1.847759065 = 15137 * 2^-13
  51  *    1.961570560 = 16069 * 2^-13
  52  *    2.053119869 = 16819 * 2^-13
  53  *    2.562915447 = 20995 * 2^-13
  54  *    3.072711026 = 25172 * 2^-13
  55  *
  56  * See jfdctint.c for further details of the DCT algorithm.  Where possible,
  57  * the variable names and comments here in jsimd_fdct_islow_neon() match up
  58  * with those in jpeg_fdct_islow().
  59  */
  60
  61 #define CONST_BITS  13
  62 #define PASS1_BITS  2
  63
  64 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
  65 #define DESCALE_P2  (CONST_BITS + PASS1_BITS)
  66
  67 #define F_0_298  2446
  68 #define F_0_390  3196
  69 #define F_0_541  4433
  70 #define F_0_765  6270
  71 #define F_0_899  7373
  72 #define F_1_175  9633
  73 #define F_1_501  12299
  74 #define F_1_847  15137
  75 #define F_1_961  16069
  76 #define F_2_053  16819
  77 #define F_2_562  20995
  78 #define F_3_072  25172
  79
  80
  81 ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
  82   F_0_298, -F_0_390,  F_0_541,  F_0_765,
  83  -F_0_899,  F_1_175,  F_1_501, -F_1_847,
  84  -F_1_961,  F_2_053, -F_2_562,  F_3_072
  85 };
  86
  87 void jsimd_fdct_islow_neon(DCTELEM *data)
  88 {
  89   /* Load DCT constants. */
  90 #ifdef HAVE_VLD1_S16_X3
  91   const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
  92 #else
  93   /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
  94   const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
  95   const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
  96   const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
  97   const int16x4x3_t consts = { { consts1, consts2, consts3 } };
  98 #endif
  99
 100   /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
 101    * are used, followed by vuzp to transpose the block such that we have a
 102    * column of samples per vector - allowing all rows to be processed at once.
 103    */
 104   int16x8x4_t s_rows_0123 = vld4q_s16(data);
 105   int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
 106
 107   int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
 108   int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
 109   int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
 110   int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
 111
 112   int16x8_t col0 = cols_04.val[0];
 113   int16x8_t col1 = cols_15.val[0];
 114   int16x8_t col2 = cols_26.val[0];
 115   int16x8_t col3 = cols_37.val[0];
 116   int16x8_t col4 = cols_04.val[1];
 117   int16x8_t col5 = cols_15.val[1];
 118   int16x8_t col6 = cols_26.val[1];
 119   int16x8_t col7 = cols_37.val[1];
 120
 121   /* Pass 1: process rows. */
 122
 123   int16x8_t tmp0 = vaddq_s16(col0, col7);
 124   int16x8_t tmp7 = vsubq_s16(col0, col7);
 125   int16x8_t tmp1 = vaddq_s16(col1, col6);
 126   int16x8_t tmp6 = vsubq_s16(col1, col6);
 127   int16x8_t tmp2 = vaddq_s16(col2, col5);
 128   int16x8_t tmp5 = vsubq_s16(col2, col5);
 129   int16x8_t tmp3 = vaddq_s16(col3, col4);
 130   int16x8_t tmp4 = vsubq_s16(col3, col4);
 131
 132   /* Even part */
 133   int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
 134   int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
 135   int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
 136   int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
 137
 138   col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
 139   col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
 140
 141   int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
 142   int32x4_t z1_l =
 143     vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
 144   int32x4_t z1_h =
 145     vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
 146
 147   int32x4_t col2_scaled_l =
 148     vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
 149   int32x4_t col2_scaled_h =
 150     vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
 151   col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
 152                       vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
 153
 154   int32x4_t col6_scaled_l =
 155     vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
 156   int32x4_t col6_scaled_h =
 157     vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
 158   col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
 159                       vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
 160
 161   /* Odd part */
 162   int16x8_t z1 = vaddq_s16(tmp4, tmp7);
 163   int16x8_t z2 = vaddq_s16(tmp5, tmp6);
 164   int16x8_t z3 = vaddq_s16(tmp4, tmp6);
 165   int16x8_t z4 = vaddq_s16(tmp5, tmp7);
 166   /* sqrt(2) * c3 */
 167   int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
 168   int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
 169   z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
 170   z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
 171
 172   /* sqrt(2) * (-c1+c3+c5-c7) */
 173   int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
 174   int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
 175   /* sqrt(2) * ( c1+c3-c5+c7) */
 176   int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
 177   int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
 178   /* sqrt(2) * ( c1+c3+c5-c7) */
 179   int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
 180   int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
 181   /* sqrt(2) * ( c1+c3-c5-c7) */
 182   int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
 183   int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
 184
 185   /* sqrt(2) * (c7-c3) */
 186   z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
 187   z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
 188   /* sqrt(2) * (-c1-c3) */
 189   int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
 190   int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
 191   /* sqrt(2) * (-c3-c5) */
 192   int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
 193   int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
 194   /* sqrt(2) * (c5-c3) */
 195   int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
 196   int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
 197
 198   z3_l = vaddq_s32(z3_l, z5_l);
 199   z3_h = vaddq_s32(z3_h, z5_h);
 200   z4_l = vaddq_s32(z4_l, z5_l);
 201   z4_h = vaddq_s32(z4_h, z5_h);
 202
 203   tmp4_l = vaddq_s32(tmp4_l, z1_l);
 204   tmp4_h = vaddq_s32(tmp4_h, z1_h);
 205   tmp4_l = vaddq_s32(tmp4_l, z3_l);
 206   tmp4_h = vaddq_s32(tmp4_h, z3_h);
 207   col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
 208                       vrshrn_n_s32(tmp4_h, DESCALE_P1));
 209
 210   tmp5_l = vaddq_s32(tmp5_l, z2_l);
 211   tmp5_h = vaddq_s32(tmp5_h, z2_h);
 212   tmp5_l = vaddq_s32(tmp5_l, z4_l);
 213   tmp5_h = vaddq_s32(tmp5_h, z4_h);
 214   col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
 215                       vrshrn_n_s32(tmp5_h, DESCALE_P1));
 216
 217   tmp6_l = vaddq_s32(tmp6_l, z2_l);
 218   tmp6_h = vaddq_s32(tmp6_h, z2_h);
 219   tmp6_l = vaddq_s32(tmp6_l, z3_l);
 220   tmp6_h = vaddq_s32(tmp6_h, z3_h);
 221   col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
 222                       vrshrn_n_s32(tmp6_h, DESCALE_P1));
 223
 224   tmp7_l = vaddq_s32(tmp7_l, z1_l);
 225   tmp7_h = vaddq_s32(tmp7_h, z1_h);
 226   tmp7_l = vaddq_s32(tmp7_l, z4_l);
 227   tmp7_h = vaddq_s32(tmp7_h, z4_h);
 228   col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
 229                       vrshrn_n_s32(tmp7_h, DESCALE_P1));
 230
 231   /* Transpose to work on columns in pass 2. */
 232   int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
 233   int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
 234   int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
 235   int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
 236
 237   int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
 238                                       vreinterpretq_s32_s16(cols_45.val[0]));
 239   int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
 240                                       vreinterpretq_s32_s16(cols_45.val[1]));
 241   int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
 242                                       vreinterpretq_s32_s16(cols_67.val[0]));
 243   int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
 244                                       vreinterpretq_s32_s16(cols_67.val[1]));
 245
 246   int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
 247   int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
 248   int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
 249   int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
 250
 251   int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
 252   int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
 253   int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
 254   int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
 255   int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
 256   int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
 257   int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
 258   int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
 259
 260   /* Pass 2: process columns. */
 261
 262   tmp0 = vaddq_s16(row0, row7);
 263   tmp7 = vsubq_s16(row0, row7);
 264   tmp1 = vaddq_s16(row1, row6);
 265   tmp6 = vsubq_s16(row1, row6);
 266   tmp2 = vaddq_s16(row2, row5);
 267   tmp5 = vsubq_s16(row2, row5);
 268   tmp3 = vaddq_s16(row3, row4);
 269   tmp4 = vsubq_s16(row3, row4);
 270
 271   /* Even part */
 272   tmp10 = vaddq_s16(tmp0, tmp3);
 273   tmp13 = vsubq_s16(tmp0, tmp3);
 274   tmp11 = vaddq_s16(tmp1, tmp2);
 275   tmp12 = vsubq_s16(tmp1, tmp2);
 276
 277   row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
 278   row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
 279
 280   tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
 281   z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
 282   z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
 283
 284   int32x4_t row2_scaled_l =
 285     vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
 286   int32x4_t row2_scaled_h =
 287     vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
 288   row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
 289                       vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
 290
 291   int32x4_t row6_scaled_l =
 292     vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
 293   int32x4_t row6_scaled_h =
 294     vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
 295   row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
 296                       vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
 297
 298   /* Odd part */
 299   z1 = vaddq_s16(tmp4, tmp7);
 300   z2 = vaddq_s16(tmp5, tmp6);
 301   z3 = vaddq_s16(tmp4, tmp6);
 302   z4 = vaddq_s16(tmp5, tmp7);
 303   /* sqrt(2) * c3 */
 304   z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
 305   z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
 306   z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
 307   z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
 308
 309   /* sqrt(2) * (-c1+c3+c5-c7) */
 310   tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
 311   tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
 312   /* sqrt(2) * ( c1+c3-c5+c7) */
 313   tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
 314   tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
 315   /* sqrt(2) * ( c1+c3+c5-c7) */
 316   tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
 317   tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
 318   /* sqrt(2) * ( c1+c3-c5-c7) */
 319   tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
 320   tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
 321
 322   /* sqrt(2) * (c7-c3) */
 323   z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
 324   z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
 325   /* sqrt(2) * (-c1-c3) */
 326   z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
 327   z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
 328   /* sqrt(2) * (-c3-c5) */
 329   z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
 330   z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
 331   /* sqrt(2) * (c5-c3) */
 332   z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
 333   z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
 334
 335   z3_l = vaddq_s32(z3_l, z5_l);
 336   z3_h = vaddq_s32(z3_h, z5_h);
 337   z4_l = vaddq_s32(z4_l, z5_l);
 338   z4_h = vaddq_s32(z4_h, z5_h);
 339
 340   tmp4_l = vaddq_s32(tmp4_l, z1_l);
 341   tmp4_h = vaddq_s32(tmp4_h, z1_h);
 342   tmp4_l = vaddq_s32(tmp4_l, z3_l);
 343   tmp4_h = vaddq_s32(tmp4_h, z3_h);
 344   row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
 345                       vrshrn_n_s32(tmp4_h, DESCALE_P2));
 346
 347   tmp5_l = vaddq_s32(tmp5_l, z2_l);
 348   tmp5_h = vaddq_s32(tmp5_h, z2_h);
 349   tmp5_l = vaddq_s32(tmp5_l, z4_l);
 350   tmp5_h = vaddq_s32(tmp5_h, z4_h);
 351   row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
 352                       vrshrn_n_s32(tmp5_h, DESCALE_P2));
 353
 354   tmp6_l = vaddq_s32(tmp6_l, z2_l);
 355   tmp6_h = vaddq_s32(tmp6_h, z2_h);
 356   tmp6_l = vaddq_s32(tmp6_l, z3_l);
 357   tmp6_h = vaddq_s32(tmp6_h, z3_h);
 358   row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
 359                       vrshrn_n_s32(tmp6_h, DESCALE_P2));
 360
 361   tmp7_l = vaddq_s32(tmp7_l, z1_l);
 362   tmp7_h = vaddq_s32(tmp7_h, z1_h);
 363   tmp7_l = vaddq_s32(tmp7_l, z4_l);
 364   tmp7_h = vaddq_s32(tmp7_h, z4_h);
 365   row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
 366                       vrshrn_n_s32(tmp7_h, DESCALE_P2));
 367
 368   vst1q_s16(data + 0 * DCTSIZE, row0);
 369   vst1q_s16(data + 1 * DCTSIZE, row1);
 370   vst1q_s16(data + 2 * DCTSIZE, row2);
 371   vst1q_s16(data + 3 * DCTSIZE, row3);
 372   vst1q_s16(data + 4 * DCTSIZE, row4);
 373   vst1q_s16(data + 5 * DCTSIZE, row5);
 374   vst1q_s16(data + 6 * DCTSIZE, row6);
 375   vst1q_s16(data + 7 * DCTSIZE, row7);
 376 }