vpx_dsp/x86/highbd_quantize_intrin_sse2.c

   1 /*
   2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <emmintrin.h>
  12
  13 #include "vpx_dsp/vpx_dsp_common.h"
  14 #include "vpx_mem/vpx_mem.h"
  15 #include "vpx_ports/mem.h"
  16
  17 #if CONFIG_VP9_HIGHBITDEPTH
  18 void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
  19                                 intptr_t count,
  20                                 int skip_block,
  21                                 const int16_t *zbin_ptr,
  22                                 const int16_t *round_ptr,
  23                                 const int16_t *quant_ptr,
  24                                 const int16_t *quant_shift_ptr,
  25                                 tran_low_t *qcoeff_ptr,
  26                                 tran_low_t *dqcoeff_ptr,
  27                                 const int16_t *dequant_ptr,
  28                                 uint16_t *eob_ptr,
  29                                 const int16_t *scan,
  30                                 const int16_t *iscan) {
  31   int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  32   __m128i zbins[2];
  33   __m128i nzbins[2];
  34
  35   zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
  36                            (int)zbin_ptr[1],
  37                            (int)zbin_ptr[1],
  38                            (int)zbin_ptr[0]);
  39   zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
  40
  41   nzbins[0] = _mm_setzero_si128();
  42   nzbins[1] = _mm_setzero_si128();
  43   nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  44   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
  45
  46   (void)scan;
  47
  48   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  49   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
  50
  51   if (!skip_block) {
  52     // Pre-scan pass
  53     for (i = ((int)count / 4) - 1; i >= 0; i--) {
  54       __m128i coeffs, cmp1, cmp2;
  55       int test;
  56       coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
  57       cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
  58       cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
  59       cmp1 = _mm_and_si128(cmp1, cmp2);
  60       test = _mm_movemask_epi8(cmp1);
  61       if (test == 0xffff)
  62         non_zero_regs--;
  63       else
  64         break;
  65     }
  66
  67     // Quantization pass:
  68     for (i = 0; i < non_zero_regs; i++) {
  69       __m128i coeffs, coeffs_sign, tmp1, tmp2;
  70       int test;
  71       int abs_coeff[4];
  72       int coeff_sign[4];
  73
  74       coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
  75       coeffs_sign = _mm_srai_epi32(coeffs, 31);
  76       coeffs = _mm_sub_epi32(
  77             _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
  78       tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
  79       tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
  80       tmp1 = _mm_or_si128(tmp1, tmp2);
  81       test = _mm_movemask_epi8(tmp1);
  82       _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
  83       _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
  84
  85       for (j = 0; j < 4; j++) {
  86         if (test & (1 << (4 * j))) {
  87           int k = 4 * i + j;
  88           const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
  89           const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
  90           const uint32_t abs_qcoeff =
  91               (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
  92           qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
  93           dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
  94           if (abs_qcoeff)
  95             eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
  96         }
  97       }
  98     }
  99   }
 100   *eob_ptr = eob_i + 1;
 101 }
 102
 103
 104 void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
 105                                       intptr_t n_coeffs,
 106                                       int skip_block,
 107                                       const int16_t *zbin_ptr,
 108                                       const int16_t *round_ptr,
 109                                       const int16_t *quant_ptr,
 110                                       const int16_t *quant_shift_ptr,
 111                                       tran_low_t *qcoeff_ptr,
 112                                       tran_low_t *dqcoeff_ptr,
 113                                       const int16_t *dequant_ptr,
 114                                       uint16_t *eob_ptr,
 115                                       const int16_t *scan,
 116                                       const int16_t *iscan) {
 117   __m128i zbins[2];
 118   __m128i nzbins[2];
 119   int idx = 0;
 120   int idx_arr[1024];
 121   int i, eob = -1;
 122   const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
 123   const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
 124   (void)scan;
 125   zbins[0] = _mm_set_epi32(zbin1_tmp,
 126                            zbin1_tmp,
 127                            zbin1_tmp,
 128                            zbin0_tmp);
 129   zbins[1] = _mm_set1_epi32(zbin1_tmp);
 130
 131   nzbins[0] = _mm_setzero_si128();
 132   nzbins[1] = _mm_setzero_si128();
 133   nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
 134   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
 135
 136   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
 137   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 138
 139   if (!skip_block) {
 140     // Pre-scan pass
 141     for (i = 0; i < n_coeffs / 4; i++) {
 142       __m128i coeffs, cmp1, cmp2;
 143       int test;
 144       coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
 145       cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
 146       cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
 147       cmp1 = _mm_and_si128(cmp1, cmp2);
 148       test = _mm_movemask_epi8(cmp1);
 149       if (!(test & 0xf))
 150         idx_arr[idx++] = i * 4;
 151       if (!(test & 0xf0))
 152         idx_arr[idx++] = i * 4 + 1;
 153       if (!(test & 0xf00))
 154         idx_arr[idx++] = i * 4 + 2;
 155       if (!(test & 0xf000))
 156         idx_arr[idx++] = i * 4 + 3;
 157     }
 158
 159     // Quantization pass: only process the coefficients selected in
 160     // pre-scan pass. Note: idx can be zero.
 161     for (i = 0; i < idx; i++) {
 162       const int rc = idx_arr[i];
 163       const int coeff = coeff_ptr[rc];
 164       const int coeff_sign = (coeff >> 31);
 165       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 166       const int64_t tmp1 = abs_coeff
 167                          + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
 168       const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
 169       const uint32_t abs_qcoeff =
 170           (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
 171       qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
 172       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
 173       if (abs_qcoeff)
 174         eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
 175     }
 176   }
 177   *eob_ptr = eob + 1;
 178 }
 179 #endif