/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
11 #include <emmintrin.h>
13 #include "vpx_dsp/vpx_dsp_common.h"
14 #include "vpx_mem/vpx_mem.h"
15 #include "vpx_ports/mem.h"
17 #if CONFIG_VP9_HIGHBITDEPTH
// Quantizes the coefficients in coeff_ptr with SSE2 intrinsics, writing the
// quantized values to qcoeff_ptr, the dequantized values to dqcoeff_ptr, and
// eob_i + 1 through eob_ptr.
//
// The round/quant/quant_shift/dequant tables are indexed with [k != 0]:
// entry 0 applies to coefficient 0 and entry 1 to every other coefficient.
//
// NOTE(review): this listing looks damaged by extraction -- statements carry
// stray leading numbers, and count, eob_ptr, zbins, nzbins, test, abs_coeff,
// coeff_sign and k are used without a visible declaration. Restore the
// missing lines from the original file before building.
18 void vp9_highbd_quantize_b_sse2(const tran_low_t
*coeff_ptr
,
21 const int16_t *zbin_ptr
,
22 const int16_t *round_ptr
,
23 const int16_t *quant_ptr
,
24 const int16_t *quant_shift_ptr
,
25 tran_low_t
*qcoeff_ptr
,
26 tran_low_t
*dqcoeff_ptr
,
27 const int16_t *dequant_ptr
,
30 const int16_t *iscan
) {
// non_zero_regs counts groups of four coefficients (one __m128i each);
// eob_i starts at -1, meaning "no position recorded yet".
31 int i
, j
, non_zero_regs
= (int)count
/ 4, eob_i
= -1;
// zbins[1] holds zbin_ptr[1] in all four lanes; zbins[0] is selected for the
// first group of four coefficients only (see the [i != 0] index below).
// NOTE(review): the remaining _mm_set_epi32 arguments are not visible here;
// presumably lane 0 carries zbin_ptr[0] -- confirm against the original.
35 zbins
[0] = _mm_set_epi32((int)zbin_ptr
[1],
39 zbins
[1] = _mm_set1_epi32((int)zbin_ptr
[1]);
// nzbins = 0 - zbins: the negated thresholds, used as the lower bound in the
// pre-scan comparison.
41 nzbins
[0] = _mm_setzero_si128();
42 nzbins
[1] = _mm_setzero_si128();
43 nzbins
[0] = _mm_sub_epi32(nzbins
[0], zbins
[0]);
44 nzbins
[1] = _mm_sub_epi32(nzbins
[1], zbins
[1]);
// Clear both outputs up front so that every coefficient not written by the
// quantization pass stays 0.
48 memset(qcoeff_ptr
, 0, count
* sizeof(*qcoeff_ptr
));
49 memset(dqcoeff_ptr
, 0, count
* sizeof(*dqcoeff_ptr
));
// Pre-scan pass: walk the groups of four coefficients from the last one
// towards the first. `test` receives four mask bits per 32-bit lane, set
// where -zbin < coeff < zbin.
// NOTE(review): the code that consumes `test` is not visible here; presumably
// it decrements non_zero_regs while a whole group lies inside that range and
// stops at the first group that does not -- confirm.
53 for (i
= ((int)count
/ 4) - 1; i
>= 0; i
--) {
54 __m128i coeffs
, cmp1
, cmp2
;
56 coeffs
= _mm_load_si128((const __m128i
*)(coeff_ptr
+ i
* 4));
57 cmp1
= _mm_cmplt_epi32(coeffs
, zbins
[i
!= 0]);
58 cmp2
= _mm_cmpgt_epi32(coeffs
, nzbins
[i
!= 0]);
59 cmp1
= _mm_and_si128(cmp1
, cmp2
);
60 test
= _mm_movemask_epi8(cmp1
);
// Quantization pass over the first non_zero_regs groups of four coefficients.
68 for (i
= 0; i
< non_zero_regs
; i
++) {
69 __m128i coeffs
, coeffs_sign
, tmp1
, tmp2
;
74 coeffs
= _mm_load_si128((const __m128i
*)(coeff_ptr
+ i
* 4));
// coeffs_sign is all ones in negative lanes and 0 otherwise, so
// (coeffs ^ sign) - sign leaves |coeffs| in each lane.
75 coeffs_sign
= _mm_srai_epi32(coeffs
, 31);
76 coeffs
= _mm_sub_epi32(
77 _mm_xor_si128(coeffs
, coeffs_sign
), coeffs_sign
);
// Flag lanes with |coeff| >= zbin, built as (greater-than OR equal).
// _mm_movemask_epi8 yields four bits per 32-bit lane, which is why lane j is
// tested with bit 4 * j below.
78 tmp1
= _mm_cmpgt_epi32(coeffs
, zbins
[i
!= 0]);
79 tmp2
= _mm_cmpeq_epi32(coeffs
, zbins
[i
!= 0]);
80 tmp1
= _mm_or_si128(tmp1
, tmp2
);
81 test
= _mm_movemask_epi8(tmp1
);
// Spill magnitudes and sign masks to scalar arrays for the per-lane 64-bit
// arithmetic below.
82 _mm_storeu_si128((__m128i
*)abs_coeff
, coeffs
);
83 _mm_storeu_si128((__m128i
*)coeff_sign
, coeffs_sign
);
85 for (j
= 0; j
< 4; j
++) {
86 if (test
& (1 << (4 * j
))) {
// 64-bit intermediates for the fixed-point products. These int64_t
// tmp1/tmp2 shadow the __m128i tmp1/tmp2 declared at the top of the loop
// body.
// NOTE(review): k is not declared in the visible lines; presumably
// 4 * i + j -- confirm.
88 const int64_t tmp1
= abs_coeff
[j
] + round_ptr
[k
!= 0];
89 const int64_t tmp2
= ((tmp1
* quant_ptr
[k
!= 0]) >> 16) + tmp1
;
90 const uint32_t abs_qcoeff
=
91 (uint32_t)((tmp2
* quant_shift_ptr
[k
!= 0]) >> 16);
// Restore the sign: (q ^ sign) - sign negates q when sign is all ones.
92 qcoeff_ptr
[k
] = (int)(abs_qcoeff
^ coeff_sign
[j
]) - coeff_sign
[j
];
93 dqcoeff_ptr
[k
] = qcoeff_ptr
[k
] * dequant_ptr
[k
!= 0];
// Keep the largest iscan[] value seen so far.
// NOTE(review): any condition guarding this update is not visible here.
95 eob_i
= iscan
[k
] > eob_i
? iscan
[k
] : eob_i
;
// eob_i starts at -1, so 0 is stored when no position was recorded.
100 *eob_ptr
= eob_i
+ 1;
104 void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t
*coeff_ptr
,
107 const int16_t *zbin_ptr
,
108 const int16_t *round_ptr
,
109 const int16_t *quant_ptr
,
110 const int16_t *quant_shift_ptr
,
111 tran_low_t
*qcoeff_ptr
,
112 tran_low_t
*dqcoeff_ptr
,
113 const int16_t *dequant_ptr
,
116 const int16_t *iscan
) {
122 const int zbin0_tmp
= ROUND_POWER_OF_TWO(zbin_ptr
[0], 1);
123 const int zbin1_tmp
= ROUND_POWER_OF_TWO(zbin_ptr
[1], 1);
125 zbins
[0] = _mm_set_epi32(zbin1_tmp
,
129 zbins
[1] = _mm_set1_epi32(zbin1_tmp
);
131 nzbins
[0] = _mm_setzero_si128();
132 nzbins
[1] = _mm_setzero_si128();
133 nzbins
[0] = _mm_sub_epi32(nzbins
[0], zbins
[0]);
134 nzbins
[1] = _mm_sub_epi32(nzbins
[1], zbins
[1]);
136 memset(qcoeff_ptr
, 0, n_coeffs
* sizeof(*qcoeff_ptr
));
137 memset(dqcoeff_ptr
, 0, n_coeffs
* sizeof(*dqcoeff_ptr
));
141 for (i
= 0; i
< n_coeffs
/ 4; i
++) {
142 __m128i coeffs
, cmp1
, cmp2
;
144 coeffs
= _mm_load_si128((const __m128i
*)(coeff_ptr
+ i
* 4));
145 cmp1
= _mm_cmplt_epi32(coeffs
, zbins
[i
!= 0]);
146 cmp2
= _mm_cmpgt_epi32(coeffs
, nzbins
[i
!= 0]);
147 cmp1
= _mm_and_si128(cmp1
, cmp2
);
148 test
= _mm_movemask_epi8(cmp1
);
150 idx_arr
[idx
++] = i
* 4;
152 idx_arr
[idx
++] = i
* 4 + 1;
154 idx_arr
[idx
++] = i
* 4 + 2;
155 if (!(test
& 0xf000))
156 idx_arr
[idx
++] = i
* 4 + 3;
159 // Quantization pass: only process the coefficients selected in
160 // pre-scan pass. Note: idx can be zero.
161 for (i
= 0; i
< idx
; i
++) {
162 const int rc
= idx_arr
[i
];
163 const int coeff
= coeff_ptr
[rc
];
164 const int coeff_sign
= (coeff
>> 31);
165 const int abs_coeff
= (coeff
^ coeff_sign
) - coeff_sign
;
166 const int64_t tmp1
= abs_coeff
167 + ROUND_POWER_OF_TWO(round_ptr
[rc
!= 0], 1);
168 const int64_t tmp2
= ((tmp1
* quant_ptr
[rc
!= 0]) >> 16) + tmp1
;
169 const uint32_t abs_qcoeff
=
170 (uint32_t)((tmp2
* quant_shift_ptr
[rc
!= 0]) >> 15);
171 qcoeff_ptr
[rc
] = (int)(abs_qcoeff
^ coeff_sign
) - coeff_sign
;
172 dqcoeff_ptr
[rc
] = qcoeff_ptr
[rc
] * dequant_ptr
[rc
!= 0] / 2;
174 eob
= iscan
[idx_arr
[i
]] > eob
? iscan
[idx_arr
[i
]] : eob
;