From 1ac91a884c3bdb5f3214d4320e1f61dbbabba428 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Fri, 20 Apr 2018 13:04:14 -0700 Subject: [PATCH] BUG FIX: av1_quantize_fp_sse2() matches the c/avx2 version. BUG=aomedia:1668 Change-Id: Id2197661742416ba649c16eac44aa57f976f6e59 --- av1/encoder/x86/av1_quantize_sse2.c | 221 ++++++++++++++---------------------- test/quantize_func_test.cc | 2 +- 2 files changed, 84 insertions(+), 139 deletions(-) diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c index 0f10e3873c..8da2e2103e 100644 --- a/av1/encoder/x86/av1_quantize_sse2.c +++ b/av1/encoder/x86/av1_quantize_sse2.c @@ -67,6 +67,73 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { } } +static INLINE void quantize(const int16_t *iscan_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + const __m128i *thr0, const __m128i *thr1, + __m128i *eob) { + __m128i coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -74,9 +141,6 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - __m128i thr; - int16_t nzflag; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; @@ -86,146 +150,27 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); - thr = _mm_srai_epi16(dequant, 1); + n_coeffs += 8 * 2; // AC only loop while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); - } else { - write_zero(qcoeff_ptr, n_coeffs); - write_zero(dqcoeff_ptr, n_coeffs); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); n_coeffs += 8 * 2; } diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc index 7936334c89..1efd8a2c9d 100644 --- a/test/quantize_func_test.cc +++ b/test/quantize_func_test.cc @@ -277,7 +277,7 @@ TEST_P(QuantizeTest, MultipleQ) { // Force the coeff to be half the value of the dequant. This exposes a // mismatch found in av1_quantize_fp_sse2(). -TEST_P(QuantizeTest, DISABLED_CoeffHalfDequant) { +TEST_P(QuantizeTest, CoeffHalfDequant) { FillCoeff(16); QuantizeRun(false, 25, 1); } -- 2.11.4.GIT