Refactor vp9_idct.h file
[aom.git] / vpx_dsp / x86 / highbd_quantize_intrin_sse2.c
blob 341283feb8c1847d8d1fc304efacae1ffd02d20a
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
11 #include <emmintrin.h>
13 #include "vpx_dsp/vpx_dsp_common.h"
14 #include "vpx_mem/vpx_mem.h"
15 #include "vpx_ports/mem.h"
17 #if CONFIG_VP9_HIGHBITDEPTH
18 void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
19 intptr_t count,
20 int skip_block,
21 const int16_t *zbin_ptr,
22 const int16_t *round_ptr,
23 const int16_t *quant_ptr,
24 const int16_t *quant_shift_ptr,
25 tran_low_t *qcoeff_ptr,
26 tran_low_t *dqcoeff_ptr,
27 const int16_t *dequant_ptr,
28 uint16_t *eob_ptr,
29 const int16_t *scan,
30 const int16_t *iscan) {
31 int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
32 __m128i zbins[2];
33 __m128i nzbins[2];
35 zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
36 (int)zbin_ptr[1],
37 (int)zbin_ptr[1],
38 (int)zbin_ptr[0]);
39 zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
41 nzbins[0] = _mm_setzero_si128();
42 nzbins[1] = _mm_setzero_si128();
43 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
44 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
46 (void)scan;
48 memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
49 memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
51 if (!skip_block) {
52 // Pre-scan pass
53 for (i = ((int)count / 4) - 1; i >= 0; i--) {
54 __m128i coeffs, cmp1, cmp2;
55 int test;
56 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
57 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
58 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
59 cmp1 = _mm_and_si128(cmp1, cmp2);
60 test = _mm_movemask_epi8(cmp1);
61 if (test == 0xffff)
62 non_zero_regs--;
63 else
64 break;
67 // Quantization pass:
68 for (i = 0; i < non_zero_regs; i++) {
69 __m128i coeffs, coeffs_sign, tmp1, tmp2;
70 int test;
71 int abs_coeff[4];
72 int coeff_sign[4];
74 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
75 coeffs_sign = _mm_srai_epi32(coeffs, 31);
76 coeffs = _mm_sub_epi32(
77 _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
78 tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
79 tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
80 tmp1 = _mm_or_si128(tmp1, tmp2);
81 test = _mm_movemask_epi8(tmp1);
82 _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
83 _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
85 for (j = 0; j < 4; j++) {
86 if (test & (1 << (4 * j))) {
87 int k = 4 * i + j;
88 const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
89 const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
90 const uint32_t abs_qcoeff =
91 (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
92 qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
93 dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
94 if (abs_qcoeff)
95 eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
100 *eob_ptr = eob_i + 1;
104 void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
105 intptr_t n_coeffs,
106 int skip_block,
107 const int16_t *zbin_ptr,
108 const int16_t *round_ptr,
109 const int16_t *quant_ptr,
110 const int16_t *quant_shift_ptr,
111 tran_low_t *qcoeff_ptr,
112 tran_low_t *dqcoeff_ptr,
113 const int16_t *dequant_ptr,
114 uint16_t *eob_ptr,
115 const int16_t *scan,
116 const int16_t *iscan) {
117 __m128i zbins[2];
118 __m128i nzbins[2];
119 int idx = 0;
120 int idx_arr[1024];
121 int i, eob = -1;
122 const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
123 const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
124 (void)scan;
125 zbins[0] = _mm_set_epi32(zbin1_tmp,
126 zbin1_tmp,
127 zbin1_tmp,
128 zbin0_tmp);
129 zbins[1] = _mm_set1_epi32(zbin1_tmp);
131 nzbins[0] = _mm_setzero_si128();
132 nzbins[1] = _mm_setzero_si128();
133 nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
134 nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
136 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
137 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
139 if (!skip_block) {
140 // Pre-scan pass
141 for (i = 0; i < n_coeffs / 4; i++) {
142 __m128i coeffs, cmp1, cmp2;
143 int test;
144 coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
145 cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
146 cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
147 cmp1 = _mm_and_si128(cmp1, cmp2);
148 test = _mm_movemask_epi8(cmp1);
149 if (!(test & 0xf))
150 idx_arr[idx++] = i * 4;
151 if (!(test & 0xf0))
152 idx_arr[idx++] = i * 4 + 1;
153 if (!(test & 0xf00))
154 idx_arr[idx++] = i * 4 + 2;
155 if (!(test & 0xf000))
156 idx_arr[idx++] = i * 4 + 3;
159 // Quantization pass: only process the coefficients selected in
160 // pre-scan pass. Note: idx can be zero.
161 for (i = 0; i < idx; i++) {
162 const int rc = idx_arr[i];
163 const int coeff = coeff_ptr[rc];
164 const int coeff_sign = (coeff >> 31);
165 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
166 const int64_t tmp1 = abs_coeff
167 + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
168 const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
169 const uint32_t abs_qcoeff =
170 (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
171 qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
172 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
173 if (abs_qcoeff)
174 eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
177 *eob_ptr = eob + 1;
179 #endif