/*
 * Matrix + lookup-table colour transform of 8-bit pixels using AVX, with two
 * pixels packed into each 256-bit register (one pixel per 128-bit half).
 *
 * Template parameters:
 *   kRIndex, kGIndex, kBIndex - byte offsets of the red, green and blue
 *                               channels inside one pixel of src / dest.
 *   kAIndex                   - byte offset of the alpha channel; when it is
 *                               left at the default NO_A_INDEX the alpha
 *                               copies below are skipped.
 *
 * Per pixel: each source channel byte indexes its input gamma table, the three
 * resulting floats are multiplied by rows 0..2 of transform->matrix and summed,
 * the sum is clamped to [0, CLAMPMAXVAL], scaled by FLOATSCALE and converted to
 * 32-bit integers, and those integers index the output tables to produce the
 * destination bytes. Alpha, when present, is copied from src to dest unchanged.
 *
 * NOTE(review): in this listing the declarations of dest, length and
 * input_back, as well as the branch/loop headers and closing braces, are not
 * visible. The pixel-count conditions mentioned in the stage comments come
 * from the pre-existing comments only -- confirm against the complete file.
 */
6 template <size_t kRIndex
, size_t kGIndex
, size_t kBIndex
, size_t kAIndex
= NO_A_INDEX
>
7 static void qcms_transform_data_template_lut_avx(const qcms_transform
*transform
,
8 const unsigned char *src
,
/* mat[i] is one row of 4 floats; rows 0..2 are loaded into mat0..mat2 below. */
12 const float (*mat
)[4] = transform
->matrix
;
14 /* Ensure we have a buffer that's 32 byte aligned regardless of the original
15 * stack alignment. We can't use __attribute__((aligned(32))) or __declspec(align(32))
16 * because they don't work on stack variables. gcc 4.4 does do the right thing
17 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
/* The address of input_back[32] is rounded down to a multiple of 32 by
 * clearing its low five bits.
 * NOTE(review): input_back must extend at least 32 bytes past the rounded
 * address for the 256-bit stores below; its declaration is not visible here. */
18 float const * input
= (float*)(((uintptr_t)&input_back
[32]) & ~0x1f);
19 /* share input and output locations to save having to keep the
20 * locations in separate registers */
/* NOTE(review): output is declared pointer-to-const, yet the vector stores
 * below write through it after a cast to a __m256i / __m128i pointer. */
21 uint32_t const * output
= (uint32_t*)input
;
23 /* deref *transform now to avoid it in loop */
24 const float *igtbl_r
= transform
->input_gamma_table_r
;
25 const float *igtbl_g
= transform
->input_gamma_table_g
;
26 const float *igtbl_b
= transform
->input_gamma_table_b
;
28 /* deref *transform now to avoid it in loop */
/* Each otdata_x points at the first byte of the matching output table's data
 * array; it is indexed with the integers computed by the vector pipeline. */
29 const uint8_t *otdata_r
= &transform
->output_table_r
->data
[0];
30 const uint8_t *otdata_g
= &transform
->output_table_g
->data
[0];
31 const uint8_t *otdata_b
= &transform
->output_table_b
->data
[0];
33 /* input matrix values never change */
/* Each matrix row (4 floats read as one __m128) is duplicated into both
 * 128-bit halves, so the same row is applied to both packed pixels. */
34 const __m256 mat0
= _mm256_broadcast_ps(reinterpret_cast<const __m128
*>(mat
[0]));
35 const __m256 mat1
= _mm256_broadcast_ps(reinterpret_cast<const __m128
*>(mat
[1]));
36 const __m256 mat2
= _mm256_broadcast_ps(reinterpret_cast<const __m128
*>(mat
[2]));
38 /* these values don't change, either */
/* max and min are the clamp bounds; scale maps the clamped value onto the
 * range that is later used as an output-table index. */
39 const __m256 max
= _mm256_set1_ps(CLAMPMAXVAL
);
40 const __m256 min
= _mm256_setzero_ps();
41 const __m256 scale
= _mm256_set1_ps(FLOATSCALE
);
/* Byte distance from one pixel to the next in src and dest, as used by the
 * "+ components" indexing below; the value comes from A_INDEX_COMPONENTS. */
42 const unsigned int components
= A_INDEX_COMPONENTS(kAIndex
);
44 /* working variables */
45 __m256 vec_r
, vec_g
, vec_b
, result
;
46 __m128 vec_r0
, vec_g0
, vec_b0
, vec_r1
, vec_g1
, vec_b1
;
47 unsigned char alpha1
, alpha2
;
53 /* If there are at least 2 pixels, then we can load their components into
54 a single 256-bit register for processing. */
/* First pixel goes to the ...0 registers, second pixel to the ...1 registers:
 * every gamma-table value is replicated across 4 floats, then the first pixel
 * is placed in the low half and the second in the high half of vec_r/g/b. */
56 vec_r0
= _mm_broadcast_ss(&igtbl_r
[src
[kRIndex
]]);
57 vec_g0
= _mm_broadcast_ss(&igtbl_g
[src
[kGIndex
]]);
58 vec_b0
= _mm_broadcast_ss(&igtbl_b
[src
[kBIndex
]]);
59 vec_r1
= _mm_broadcast_ss(&igtbl_r
[src
[kRIndex
+ components
]]);
60 vec_g1
= _mm_broadcast_ss(&igtbl_g
[src
[kGIndex
+ components
]]);
61 vec_b1
= _mm_broadcast_ss(&igtbl_b
[src
[kBIndex
+ components
]]);
62 vec_r
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0
), vec_r1
, 1);
63 vec_g
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0
), vec_g1
, 1);
64 vec_b
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0
), vec_b1
, 1);
/* Both operands of this comparison are compile-time constants. The alpha
 * bytes of the two pixels are held in scalars until their dest slots are
 * written. */
66 if (kAIndex
!= NO_A_INDEX
) {
67 alpha1
= src
[kAIndex
];
68 alpha2
= src
[kAIndex
+ components
];
72 /* If there are at least 4 pixels, then we can iterate and preload the
73 next 2 while we store the result of the current 2. */
75 /* Ensure we are pointing at the next 2 pixels for the next load. */
76 src
+= 2 * components
;
/* Multiply each gamma-corrected channel by its matrix row. */
79 vec_r
= _mm256_mul_ps(vec_r
, mat0
);
80 vec_g
= _mm256_mul_ps(vec_g
, mat1
);
81 vec_b
= _mm256_mul_ps(vec_b
, mat2
);
83 /* store alpha for these pixels; load alpha for next two */
84 if (kAIndex
!= NO_A_INDEX
) {
85 dest
[kAIndex
] = alpha1
;
86 dest
[kAIndex
+ components
] = alpha2
;
87 alpha1
= src
[kAIndex
];
88 alpha2
= src
[kAIndex
+ components
];
91 /* crunch, crunch, crunch */
/* Sum the three products, clamp to [0, CLAMPMAXVAL], then apply scale. */
92 vec_r
= _mm256_add_ps(vec_r
, _mm256_add_ps(vec_g
, vec_b
));
93 vec_r
= _mm256_max_ps(min
, vec_r
);
94 vec_r
= _mm256_min_ps(max
, vec_r
);
95 result
= _mm256_mul_ps(vec_r
, scale
);
97 /* store calc'd output tables indices */
/* The 8 floats are converted to 32-bit integers and written with an aligned
 * store, which is why output was rounded to a 32-byte boundary above. */
98 _mm256_store_si256((__m256i
*)output
, _mm256_cvtps_epi32(result
));
100 /* load gamma values for next loop while store completes */
/* src was already advanced above, so these reads fetch the following pair of
 * pixels. */
101 vec_r0
= _mm_broadcast_ss(&igtbl_r
[src
[kRIndex
]]);
102 vec_g0
= _mm_broadcast_ss(&igtbl_g
[src
[kGIndex
]]);
103 vec_b0
= _mm_broadcast_ss(&igtbl_b
[src
[kBIndex
]]);
104 vec_r1
= _mm_broadcast_ss(&igtbl_r
[src
[kRIndex
+ components
]]);
105 vec_g1
= _mm_broadcast_ss(&igtbl_g
[src
[kGIndex
+ components
]]);
106 vec_b1
= _mm_broadcast_ss(&igtbl_b
[src
[kBIndex
+ components
]]);
107 vec_r
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0
), vec_r1
, 1);
108 vec_g
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0
), vec_g1
, 1);
109 vec_b
= _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0
), vec_b1
, 1);
111 /* use calc'd indices to output RGB values */
/* Elements 0..2 of output belong to the first pixel and 4..6 to the second;
 * elements 3 and 7 are never read. */
112 dest
[kRIndex
] = otdata_r
[output
[0]];
113 dest
[kGIndex
] = otdata_g
[output
[1]];
114 dest
[kBIndex
] = otdata_b
[output
[2]];
115 dest
[kRIndex
+ components
] = otdata_r
[output
[4]];
116 dest
[kGIndex
+ components
] = otdata_g
[output
[5]];
117 dest
[kBIndex
+ components
] = otdata_b
[output
[6]];
119 dest
+= 2 * components
;
123 /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know
124 we have already populated the necessary registers to start the transform. */
/* Same multiply / sum / clamp / scale / store sequence as above, applied to
 * the pair already held in vec_r/g/b, but without loading a following pair. */
126 vec_r
= _mm256_mul_ps(vec_r
, mat0
);
127 vec_g
= _mm256_mul_ps(vec_g
, mat1
);
128 vec_b
= _mm256_mul_ps(vec_b
, mat2
);
130 if (kAIndex
!= NO_A_INDEX
) {
131 dest
[kAIndex
] = alpha1
;
132 dest
[kAIndex
+ components
] = alpha2
;
135 vec_r
= _mm256_add_ps(vec_r
, _mm256_add_ps(vec_g
, vec_b
));
136 vec_r
= _mm256_max_ps(min
, vec_r
);
137 vec_r
= _mm256_min_ps(max
, vec_r
);
138 result
= _mm256_mul_ps(vec_r
, scale
);
140 _mm256_store_si256((__m256i
*)output
, _mm256_cvtps_epi32(result
));
142 dest
[kRIndex
] = otdata_r
[output
[0]];
143 dest
[kGIndex
] = otdata_g
[output
[1]];
144 dest
[kBIndex
] = otdata_b
[output
[2]];
145 dest
[kRIndex
+ components
] = otdata_r
[output
[4]];
146 dest
[kGIndex
+ components
] = otdata_g
[output
[5]];
147 dest
[kBIndex
+ components
] = otdata_b
[output
[6]];
/* Step both pointers past the pair that was just written. */
149 src
+= 2 * components
;
150 dest
+= 2 * components
;
154 /* There may be 0-1 pixels remaining. */
/* Single-pixel path: the same pipeline on 128-bit registers, using the low
 * halves of mat0..mat2, min, max and scale. */
156 vec_r0
= _mm_broadcast_ss(&igtbl_r
[src
[kRIndex
]]);
157 vec_g0
= _mm_broadcast_ss(&igtbl_g
[src
[kGIndex
]]);
158 vec_b0
= _mm_broadcast_ss(&igtbl_b
[src
[kBIndex
]]);
160 vec_r0
= _mm_mul_ps(vec_r0
, _mm256_castps256_ps128(mat0
));
161 vec_g0
= _mm_mul_ps(vec_g0
, _mm256_castps256_ps128(mat1
));
162 vec_b0
= _mm_mul_ps(vec_b0
, _mm256_castps256_ps128(mat2
));
/* With only one pixel the alpha byte is copied straight from src to dest. */
164 if (kAIndex
!= NO_A_INDEX
) {
165 dest
[kAIndex
] = src
[kAIndex
];
168 vec_r0
= _mm_add_ps(vec_r0
, _mm_add_ps(vec_g0
, vec_b0
));
169 vec_r0
= _mm_max_ps(_mm256_castps256_ps128(min
), vec_r0
);
170 vec_r0
= _mm_min_ps(_mm256_castps256_ps128(max
), vec_r0
);
171 vec_r0
= _mm_mul_ps(vec_r0
, _mm256_castps256_ps128(scale
));
/* 128-bit aligned store of the four converted integers; only elements 0..2
 * are read back. */
173 _mm_store_si128((__m128i
*)output
, _mm_cvtps_epi32(vec_r0
));
175 dest
[kRIndex
] = otdata_r
[output
[0]];
176 dest
[kGIndex
] = otdata_g
[output
[1]];
177 dest
[kBIndex
] = otdata_b
[output
[2]];
/* RGB entry point: instantiates the AVX template with the RGBA_* colour
 * offsets and no alpha argument, so kAIndex takes its NO_A_INDEX default.
 * transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameter declarations and the braces are not
 * visible in this listing. */
181 void qcms_transform_data_rgb_out_lut_avx(const qcms_transform
*transform
,
182 const unsigned char *src
,
186 qcms_transform_data_template_lut_avx
<RGBA_R_INDEX
, RGBA_G_INDEX
, RGBA_B_INDEX
>(transform
, src
, dest
, length
);
/* RGBA entry point: instantiates the AVX template with the RGBA_* colour
 * offsets plus RGBA_A_INDEX, which enables the alpha copies in the template.
 * transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameter declarations and the braces are not
 * visible in this listing. */
189 void qcms_transform_data_rgba_out_lut_avx(const qcms_transform
*transform
,
190 const unsigned char *src
,
194 qcms_transform_data_template_lut_avx
<RGBA_R_INDEX
, RGBA_G_INDEX
, RGBA_B_INDEX
, RGBA_A_INDEX
>(transform
, src
, dest
, length
);
/* BGRA entry point: instantiates the AVX template with the BGRA_* colour
 * offsets plus BGRA_A_INDEX, which enables the alpha copies in the template.
 * transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameter declarations and the braces are not
 * visible in this listing. */
197 void qcms_transform_data_bgra_out_lut_avx(const qcms_transform
*transform
,
198 const unsigned char *src
,
202 qcms_transform_data_template_lut_avx
<BGRA_R_INDEX
, BGRA_G_INDEX
, BGRA_B_INDEX
, BGRA_A_INDEX
>(transform
, src
, dest
, length
);