From 7cb2db1c14c8e324123400356920879517ca3aba Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 12 Mar 2018 06:49:03 -0700 Subject: [PATCH] Add ssse3 version of aom_smooth_predictor_wxh() for width or height equal to 64. Change-Id: Ib911ec16b66cba225b3bae536d5c651bce5ae422 --- aom_dsp/aom_dsp_rtcd_defs.pl | 4 ++ aom_dsp/x86/intrapred_ssse3.c | 85 +++++++++++++++++++++++++++++++++++++++++++ test/test_intra_pred_speed.cc | 12 ++++-- 3 files changed, 97 insertions(+), 4 deletions(-) diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index f9ab56b6df..580cfb14cd 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -177,6 +177,10 @@ specialize qw/aom_smooth_predictor_16x16 ssse3/; specialize qw/aom_smooth_predictor_16x32 ssse3/; specialize qw/aom_smooth_predictor_32x16 ssse3/; specialize qw/aom_smooth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_32x64 ssse3/; +specialize qw/aom_smooth_predictor_64x64 ssse3/; +specialize qw/aom_smooth_predictor_64x32 ssse3/; +specialize qw/aom_smooth_predictor_64x16 ssse3/; # TODO(yunqingwang): optimize rectangular DC_PRED to replace division # by multiply and shift. diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c index 5a96048a43..b8313406b1 100644 --- a/aom_dsp/x86/intrapred_ssse3.c +++ b/aom_dsp/x86/intrapred_ssse3.c @@ -1039,3 +1039,88 @@ void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, dst += stride << 3; smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3); } + +static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); + const __m128i dup16 = + _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100); + const __m128i top_right = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); + pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); + const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); + + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + const __m128i scale_m_weights_x = + _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); + const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); + const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); + const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); + + pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); + pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); + + pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); + pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); + + pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); + pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 9a27525c2e..bfc7a43a29 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -702,7 +702,8 @@ INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, aom_paeth_predictor_32x16_ssse3, aom_smooth_predictor_32x16_ssse3, NULL, NULL) INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL, - aom_paeth_predictor_32x64_ssse3, NULL, NULL, NULL) + aom_paeth_predictor_32x64_ssse3, + aom_smooth_predictor_32x64_ssse3, NULL, NULL) #endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -783,11 +784,14 @@ INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2, #if HAVE_SSSE3 INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL, - aom_paeth_predictor_64x64_ssse3, NULL, NULL, NULL) + aom_paeth_predictor_64x64_ssse3, + aom_smooth_predictor_64x64_ssse3, NULL, NULL) INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL, - aom_paeth_predictor_64x32_ssse3, NULL, NULL, NULL) + aom_paeth_predictor_64x32_ssse3, + aom_smooth_predictor_64x32_ssse3, NULL, NULL) INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL, - aom_paeth_predictor_64x16_ssse3, NULL, NULL, NULL) + aom_paeth_predictor_64x16_ssse3, + aom_smooth_predictor_64x16_ssse3, NULL, NULL) #endif #if HAVE_AVX2 -- 2.11.4.GIT