Bug 1639153 - Part 6.4: Add tls dependency for WasmTruncateToInt64 and Int64ToFloatin...
[gecko.git] / gfx / qcms / transform-avx.cpp
blobff0b5137b61ab6d24a0eb335522e03deefedea59
1 #include <emmintrin.h>
2 #include <immintrin.h>
4 #include "qcmsint.h"
6 template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
7 static void qcms_transform_data_template_lut_avx(const qcms_transform *transform,
8 const unsigned char *src,
9 unsigned char *dest,
10 size_t length)
12 const float (*mat)[4] = transform->matrix;
13 char input_back[64];
14 /* Ensure we have a buffer that's 32 byte aligned regardless of the original
15 * stack alignment. We can't use __attribute__((aligned(32))) or __declspec(align(32))
16 * because they don't work on stack variables. gcc 4.4 does do the right thing
17 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
18 float const * input = (float*)(((uintptr_t)&input_back[32]) & ~0x1f);
19 /* share input and output locations to save having to keep the
20 * locations in separate registers */
21 uint32_t const * output = (uint32_t*)input;
23 /* deref *transform now to avoid it in loop */
24 const float *igtbl_r = transform->input_gamma_table_r;
25 const float *igtbl_g = transform->input_gamma_table_g;
26 const float *igtbl_b = transform->input_gamma_table_b;
28 /* deref *transform now to avoid it in loop */
29 const uint8_t *otdata_r = &transform->output_table_r->data[0];
30 const uint8_t *otdata_g = &transform->output_table_g->data[0];
31 const uint8_t *otdata_b = &transform->output_table_b->data[0];
33 /* input matrix values never change */
34 const __m256 mat0 = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(mat[0]));
35 const __m256 mat1 = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(mat[1]));
36 const __m256 mat2 = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(mat[2]));
38 /* these values don't change, either */
39 const __m256 max = _mm256_set1_ps(CLAMPMAXVAL);
40 const __m256 min = _mm256_setzero_ps();
41 const __m256 scale = _mm256_set1_ps(FLOATSCALE);
42 const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
44 /* working variables */
45 __m256 vec_r, vec_g, vec_b, result;
46 __m128 vec_r0, vec_g0, vec_b0, vec_r1, vec_g1, vec_b1;
47 unsigned char alpha1, alpha2;
49 /* CYA */
50 if (!length)
51 return;
53 /* If there are at least 2 pixels, then we can load their components into
54 a single 256-bit register for processing. */
55 if (length > 1) {
56 vec_r0 = _mm_broadcast_ss(&igtbl_r[src[kRIndex]]);
57 vec_g0 = _mm_broadcast_ss(&igtbl_g[src[kGIndex]]);
58 vec_b0 = _mm_broadcast_ss(&igtbl_b[src[kBIndex]]);
59 vec_r1 = _mm_broadcast_ss(&igtbl_r[src[kRIndex + components]]);
60 vec_g1 = _mm_broadcast_ss(&igtbl_g[src[kGIndex + components]]);
61 vec_b1 = _mm_broadcast_ss(&igtbl_b[src[kBIndex + components]]);
62 vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
63 vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
64 vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
66 if (kAIndex != NO_A_INDEX) {
67 alpha1 = src[kAIndex];
68 alpha2 = src[kAIndex + components];
72 /* If there are at least 4 pixels, then we can iterate and preload the
73 next 2 while we store the result of the current 2. */
74 while (length > 3) {
75 /* Ensure we are pointing at the next 2 pixels for the next load. */
76 src += 2 * components;
78 /* gamma * matrix */
79 vec_r = _mm256_mul_ps(vec_r, mat0);
80 vec_g = _mm256_mul_ps(vec_g, mat1);
81 vec_b = _mm256_mul_ps(vec_b, mat2);
83 /* store alpha for these pixels; load alpha for next two */
84 if (kAIndex != NO_A_INDEX) {
85 dest[kAIndex] = alpha1;
86 dest[kAIndex + components] = alpha2;
87 alpha1 = src[kAIndex];
88 alpha2 = src[kAIndex + components];
91 /* crunch, crunch, crunch */
92 vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
93 vec_r = _mm256_max_ps(min, vec_r);
94 vec_r = _mm256_min_ps(max, vec_r);
95 result = _mm256_mul_ps(vec_r, scale);
97 /* store calc'd output tables indices */
98 _mm256_store_si256((__m256i*)output, _mm256_cvtps_epi32(result));
100 /* load gamma values for next loop while store completes */
101 vec_r0 = _mm_broadcast_ss(&igtbl_r[src[kRIndex]]);
102 vec_g0 = _mm_broadcast_ss(&igtbl_g[src[kGIndex]]);
103 vec_b0 = _mm_broadcast_ss(&igtbl_b[src[kBIndex]]);
104 vec_r1 = _mm_broadcast_ss(&igtbl_r[src[kRIndex + components]]);
105 vec_g1 = _mm_broadcast_ss(&igtbl_g[src[kGIndex + components]]);
106 vec_b1 = _mm_broadcast_ss(&igtbl_b[src[kBIndex + components]]);
107 vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
108 vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
109 vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
111 /* use calc'd indices to output RGB values */
112 dest[kRIndex] = otdata_r[output[0]];
113 dest[kGIndex] = otdata_g[output[1]];
114 dest[kBIndex] = otdata_b[output[2]];
115 dest[kRIndex + components] = otdata_r[output[4]];
116 dest[kGIndex + components] = otdata_g[output[5]];
117 dest[kBIndex + components] = otdata_b[output[6]];
119 dest += 2 * components;
120 length -= 2;
123 /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know
124 we have already populated the necessary registers to start the transform. */
125 if (length > 1) {
126 vec_r = _mm256_mul_ps(vec_r, mat0);
127 vec_g = _mm256_mul_ps(vec_g, mat1);
128 vec_b = _mm256_mul_ps(vec_b, mat2);
130 if (kAIndex != NO_A_INDEX) {
131 dest[kAIndex] = alpha1;
132 dest[kAIndex + components] = alpha2;
135 vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
136 vec_r = _mm256_max_ps(min, vec_r);
137 vec_r = _mm256_min_ps(max, vec_r);
138 result = _mm256_mul_ps(vec_r, scale);
140 _mm256_store_si256((__m256i*)output, _mm256_cvtps_epi32(result));
142 dest[kRIndex] = otdata_r[output[0]];
143 dest[kGIndex] = otdata_g[output[1]];
144 dest[kBIndex] = otdata_b[output[2]];
145 dest[kRIndex + components] = otdata_r[output[4]];
146 dest[kGIndex + components] = otdata_g[output[5]];
147 dest[kBIndex + components] = otdata_b[output[6]];
149 src += 2 * components;
150 dest += 2 * components;
151 length -= 2;
154 /* There may be 0-1 pixels remaining. */
155 if (length == 1) {
156 vec_r0 = _mm_broadcast_ss(&igtbl_r[src[kRIndex]]);
157 vec_g0 = _mm_broadcast_ss(&igtbl_g[src[kGIndex]]);
158 vec_b0 = _mm_broadcast_ss(&igtbl_b[src[kBIndex]]);
160 vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0));
161 vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1));
162 vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2));
164 if (kAIndex != NO_A_INDEX) {
165 dest[kAIndex] = src[kAIndex];
168 vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0));
169 vec_r0 = _mm_max_ps(_mm256_castps256_ps128(min), vec_r0);
170 vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0);
171 vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale));
173 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(vec_r0));
175 dest[kRIndex] = otdata_r[output[0]];
176 dest[kGIndex] = otdata_g[output[1]];
177 dest[kBIndex] = otdata_b[output[2]];
181 void qcms_transform_data_rgb_out_lut_avx(const qcms_transform *transform,
182 const unsigned char *src,
183 unsigned char *dest,
184 size_t length)
186 qcms_transform_data_template_lut_avx<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
189 void qcms_transform_data_rgba_out_lut_avx(const qcms_transform *transform,
190 const unsigned char *src,
191 unsigned char *dest,
192 size_t length)
194 qcms_transform_data_template_lut_avx<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
197 void qcms_transform_data_bgra_out_lut_avx(const qcms_transform *transform,
198 const unsigned char *src,
199 unsigned char *dest,
200 size_t length)
202 qcms_transform_data_template_lut_avx<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);