/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
15 #include "vpx_mem/vpx_mem.h"
17 #include "vp9/common/vp9_quant_common.h"
18 #include "vp9/common/vp9_seg_common.h"
20 #include "vp9/encoder/vp9_encoder.h"
21 #include "vp9/encoder/vp9_quantize.h"
22 #include "vp9/encoder/vp9_rd.h"
24 void vp9_quantize_fp_neon(const int16_t *coeff_ptr
, intptr_t count
,
25 int skip_block
, const int16_t *zbin_ptr
,
26 const int16_t *round_ptr
, const int16_t *quant_ptr
,
27 const int16_t *quant_shift_ptr
, int16_t *qcoeff_ptr
,
28 int16_t *dqcoeff_ptr
, const int16_t *dequant_ptr
,
29 int zbin_oq_value
, uint16_t *eob_ptr
,
30 const int16_t *scan
, const int16_t *iscan
) {
31 // TODO(jingning) Decide the need of these arguments after the
32 // quantization process is completed.
34 (void)quant_shift_ptr
;
39 // Quantization pass: All coefficients with index >= zero_flag are
40 // skippable. Note: zero_flag can be zero.
42 const int16x8_t v_zero
= vdupq_n_s16(0);
43 const int16x8_t v_one
= vdupq_n_s16(1);
44 int16x8_t v_eobmax_76543210
= vdupq_n_s16(-1);
45 int16x8_t v_round
= vmovq_n_s16(round_ptr
[1]);
46 int16x8_t v_quant
= vmovq_n_s16(quant_ptr
[1]);
47 int16x8_t v_dequant
= vmovq_n_s16(dequant_ptr
[1]);
49 v_round
= vsetq_lane_s16(round_ptr
[0], v_round
, 0);
50 v_quant
= vsetq_lane_s16(quant_ptr
[0], v_quant
, 0);
51 v_dequant
= vsetq_lane_s16(dequant_ptr
[0], v_dequant
, 0);
52 // process dc and the first seven ac coeffs
54 const int16x8_t v_iscan
= vld1q_s16(&iscan
[0]);
55 const int16x8_t v_coeff
= vld1q_s16(&coeff_ptr
[0]);
56 const int16x8_t v_coeff_sign
= vshrq_n_s16(v_coeff
, 15);
57 const int16x8_t v_tmp
= vabaq_s16(v_round
, v_coeff
, v_zero
);
58 const int32x4_t v_tmp_lo
= vmull_s16(vget_low_s16(v_tmp
),
59 vget_low_s16(v_quant
));
60 const int32x4_t v_tmp_hi
= vmull_s16(vget_high_s16(v_tmp
),
61 vget_high_s16(v_quant
));
62 const int16x8_t v_tmp2
= vcombine_s16(vshrn_n_s32(v_tmp_lo
, 16),
63 vshrn_n_s32(v_tmp_hi
, 16));
64 const uint16x8_t v_nz_mask
= vceqq_s16(v_tmp2
, v_zero
);
65 const int16x8_t v_iscan_plus1
= vaddq_s16(v_iscan
, v_one
);
66 const int16x8_t v_nz_iscan
= vbslq_s16(v_nz_mask
, v_zero
, v_iscan_plus1
);
67 const int16x8_t v_qcoeff_a
= veorq_s16(v_tmp2
, v_coeff_sign
);
68 const int16x8_t v_qcoeff
= vsubq_s16(v_qcoeff_a
, v_coeff_sign
);
69 const int16x8_t v_dqcoeff
= vmulq_s16(v_qcoeff
, v_dequant
);
70 v_eobmax_76543210
= vmaxq_s16(v_eobmax_76543210
, v_nz_iscan
);
71 vst1q_s16(&qcoeff_ptr
[0], v_qcoeff
);
72 vst1q_s16(&dqcoeff_ptr
[0], v_dqcoeff
);
73 v_round
= vmovq_n_s16(round_ptr
[1]);
74 v_quant
= vmovq_n_s16(quant_ptr
[1]);
75 v_dequant
= vmovq_n_s16(dequant_ptr
[1]);
77 // now process the rest of the ac coeffs
78 for (i
= 8; i
< count
; i
+= 8) {
79 const int16x8_t v_iscan
= vld1q_s16(&iscan
[i
]);
80 const int16x8_t v_coeff
= vld1q_s16(&coeff_ptr
[i
]);
81 const int16x8_t v_coeff_sign
= vshrq_n_s16(v_coeff
, 15);
82 const int16x8_t v_tmp
= vabaq_s16(v_round
, v_coeff
, v_zero
);
83 const int32x4_t v_tmp_lo
= vmull_s16(vget_low_s16(v_tmp
),
84 vget_low_s16(v_quant
));
85 const int32x4_t v_tmp_hi
= vmull_s16(vget_high_s16(v_tmp
),
86 vget_high_s16(v_quant
));
87 const int16x8_t v_tmp2
= vcombine_s16(vshrn_n_s32(v_tmp_lo
, 16),
88 vshrn_n_s32(v_tmp_hi
, 16));
89 const uint16x8_t v_nz_mask
= vceqq_s16(v_tmp2
, v_zero
);
90 const int16x8_t v_iscan_plus1
= vaddq_s16(v_iscan
, v_one
);
91 const int16x8_t v_nz_iscan
= vbslq_s16(v_nz_mask
, v_zero
, v_iscan_plus1
);
92 const int16x8_t v_qcoeff_a
= veorq_s16(v_tmp2
, v_coeff_sign
);
93 const int16x8_t v_qcoeff
= vsubq_s16(v_qcoeff_a
, v_coeff_sign
);
94 const int16x8_t v_dqcoeff
= vmulq_s16(v_qcoeff
, v_dequant
);
95 v_eobmax_76543210
= vmaxq_s16(v_eobmax_76543210
, v_nz_iscan
);
96 vst1q_s16(&qcoeff_ptr
[i
], v_qcoeff
);
97 vst1q_s16(&dqcoeff_ptr
[i
], v_dqcoeff
);
100 const int16x4_t v_eobmax_3210
=
101 vmax_s16(vget_low_s16(v_eobmax_76543210
),
102 vget_high_s16(v_eobmax_76543210
));
103 const int64x1_t v_eobmax_xx32
=
104 vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210
), 32);
105 const int16x4_t v_eobmax_tmp
=
106 vmax_s16(v_eobmax_3210
, vreinterpret_s16_s64(v_eobmax_xx32
));
107 const int64x1_t v_eobmax_xxx3
=
108 vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp
), 16);
109 const int16x4_t v_eobmax_final
=
110 vmax_s16(v_eobmax_tmp
, vreinterpret_s16_s64(v_eobmax_xxx3
));
112 *eob_ptr
= (uint16_t)vget_lane_s16(v_eobmax_final
, 0);
115 vpx_memset(qcoeff_ptr
, 0, count
* sizeof(int16_t));
116 vpx_memset(dqcoeff_ptr
, 0, count
* sizeof(int16_t));