5 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
6 static const ALIGN
float floatScaleX4
[4] =
7 { FLOATSCALE
, FLOATSCALE
, FLOATSCALE
, FLOATSCALE
};
8 static const ALIGN
float clampMaxValueX4
[4] =
9 { CLAMPMAXVAL
, CLAMPMAXVAL
, CLAMPMAXVAL
, CLAMPMAXVAL
};
/*
 * Matrix + lookup-table colour transform for packed 8-bit pixels, written
 * with SSE1 intrinsics.
 *
 * Template parameters:
 *   kRIndex, kGIndex, kBIndex - positions of the red, green and blue bytes
 *       inside one pixel; they index both src and dest.
 *   kAIndex - position of the alpha byte. Defaults to NO_A_INDEX, and every
 *       alpha store below is guarded by kAIndex != NO_A_INDEX.
 *
 * Per pixel, as visible here: each channel byte selects a float from the
 * matching input gamma table; that float is broadcast across a vector and
 * multiplied by mat[0], mat[1] or mat[2]; the three products are summed,
 * clamped between zero and clampMaxValueX4, scaled by floatScaleX4 and
 * converted to 32-bit integers; those integers index the output tables to
 * produce the destination bytes.
 *
 * The work is staggered: the gamma values for the first pixel are loaded
 * before the loop, each loop pass finishes one pixel while loading the gamma
 * values of the next, and the last pixel is finished after the loop.
 *
 * NOTE(review): this listing has gaps. dest, length, i, alpha and input_back
 * are used below, but their declarations, several braces and the src/dest
 * pointer advances are not visible in this excerpt - confirm against the
 * complete file before changing anything here.
 */
11 template <size_t kRIndex
, size_t kGIndex
, size_t kBIndex
, size_t kAIndex
= NO_A_INDEX
>
12 static void qcms_transform_data_template_lut_sse1(const qcms_transform
*transform
,
13 const unsigned char *src
,
/* mat views transform->matrix as rows of four floats */
18 const float (*mat
)[4] = transform
->matrix
;
20 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
21 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
22 * because they don't work on stack variables. gcc 4.4 does do the right thing
23 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
/* clearing the low four address bits rounds &input_back[16] down to a
 * multiple of 16 */
24 float const * input
= (float*)(((uintptr_t)&input_back
[16]) & ~0xf);
25 /* share input and output locations to save having to keep the
26 * locations in separate registers */
/* NOTE(review): output is declared pointer-to-const, yet the conversion
 * results are stored through it via a __m64 * cast further down. */
27 uint32_t const * output
= (uint32_t*)input
;
29 /* deref *transform now to avoid it in loop */
30 const float *igtbl_r
= transform
->input_gamma_table_r
;
31 const float *igtbl_g
= transform
->input_gamma_table_g
;
32 const float *igtbl_b
= transform
->input_gamma_table_b
;
34 /* deref *transform now to avoid it in loop */
35 const uint8_t *otdata_r
= &transform
->output_table_r
->data
[0];
36 const uint8_t *otdata_g
= &transform
->output_table_g
->data
[0];
37 const uint8_t *otdata_b
= &transform
->output_table_b
->data
[0];
39 /* input matrix values never change */
/* _mm_load_ps is an aligned 128-bit load, so each row address must be
 * 16-byte aligned */
40 const __m128 mat0
= _mm_load_ps(mat
[0]);
41 const __m128 mat1
= _mm_load_ps(mat
[1]);
42 const __m128 mat2
= _mm_load_ps(mat
[2]);
44 /* these values don't change, either */
45 const __m128 max
= _mm_load_ps(clampMaxValueX4
);
46 const __m128 min
= _mm_setzero_ps();
47 const __m128 scale
= _mm_load_ps(floatScaleX4
);
/* NOTE(review): presumably the number of bytes per pixel for this layout;
 * A_INDEX_COMPONENTS and the uses of components are not visible in this
 * excerpt - confirm. */
48 const unsigned int components
= A_INDEX_COMPONENTS(kAIndex
);
50 /* working variables */
51 __m128 vec_r
, vec_g
, vec_b
, result
;
58 /* one pixel is handled outside of the loop */
61 /* setup for transforming 1st pixel */
/* _mm_load_ss puts the looked-up gamma value in lane 0 and zeroes the
 * remaining lanes */
62 vec_r
= _mm_load_ss(&igtbl_r
[src
[kRIndex
]]);
63 vec_g
= _mm_load_ss(&igtbl_g
[src
[kGIndex
]]);
64 vec_b
= _mm_load_ss(&igtbl_b
[src
[kBIndex
]]);
65 if (kAIndex
!= NO_A_INDEX
) {
70 /* transform all but final pixel */
72 for (i
=0; i
<length
; i
++)
74 /* position values from gamma tables */
/* shuffle mask 0 copies lane 0 into all four lanes */
75 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
76 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
77 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
/* multiply each broadcast channel value by its matrix row */
80 vec_r
= _mm_mul_ps(vec_r
, mat0
);
81 vec_g
= _mm_mul_ps(vec_g
, mat1
);
82 vec_b
= _mm_mul_ps(vec_b
, mat2
);
84 /* store alpha for this pixel; load alpha for next */
85 if (kAIndex
!= NO_A_INDEX
) {
86 dest
[kAIndex
] = alpha
;
90 /* crunch, crunch, crunch */
/* sum the three products, clamp the sum to [min, max], then apply scale */
91 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
92 vec_r
= _mm_max_ps(min
, vec_r
);
93 vec_r
= _mm_min_ps(max
, vec_r
);
94 result
= _mm_mul_ps(vec_r
, scale
);
96 /* store calc'd output tables indices */
/* _mm_cvtps_pi32 converts only the two low lanes to 32-bit integers, so
 * lanes 0-1 go to output[0..1]; _mm_movehl_ps then brings lanes 2-3 down
 * so the second conversion fills output[2..3].
 * NOTE(review): _mm_cvtps_pi32 yields an __m64 (MMX) value; MMX use is
 * normally followed by _mm_empty() before any x87 code runs. No such call
 * is visible in this excerpt - confirm it exists in the complete file. */
97 *((__m64
*)&output
[0]) = _mm_cvtps_pi32(result
);
98 result
= _mm_movehl_ps(result
, result
);
99 *((__m64
*)&output
[2]) = _mm_cvtps_pi32(result
);
101 /* load gamma values for next loop while store completes */
102 vec_r
= _mm_load_ss(&igtbl_r
[src
[kRIndex
]]);
103 vec_g
= _mm_load_ss(&igtbl_g
[src
[kGIndex
]]);
104 vec_b
= _mm_load_ss(&igtbl_b
[src
[kBIndex
]]);
107 /* use calc'd indices to output RGB values */
108 dest
[kRIndex
] = otdata_r
[output
[0]];
109 dest
[kGIndex
] = otdata_g
[output
[1]];
110 dest
[kBIndex
] = otdata_b
[output
[2]];
114 /* handle final (maybe only) pixel */
/* same steps as the loop body, without the look-ahead gamma loads */
116 vec_r
= _mm_shuffle_ps(vec_r
, vec_r
, 0);
117 vec_g
= _mm_shuffle_ps(vec_g
, vec_g
, 0);
118 vec_b
= _mm_shuffle_ps(vec_b
, vec_b
, 0);
120 vec_r
= _mm_mul_ps(vec_r
, mat0
);
121 vec_g
= _mm_mul_ps(vec_g
, mat1
);
122 vec_b
= _mm_mul_ps(vec_b
, mat2
);
124 if (kAIndex
!= NO_A_INDEX
) {
125 dest
[kAIndex
] = alpha
;
128 vec_r
= _mm_add_ps(vec_r
, _mm_add_ps(vec_g
, vec_b
));
129 vec_r
= _mm_max_ps(min
, vec_r
);
130 vec_r
= _mm_min_ps(max
, vec_r
);
131 result
= _mm_mul_ps(vec_r
, scale
);
133 *((__m64
*)&output
[0]) = _mm_cvtps_pi32(result
);
134 result
= _mm_movehl_ps(result
, result
);
135 *((__m64
*)&output
[2]) = _mm_cvtps_pi32(result
);
137 dest
[kRIndex
] = otdata_r
[output
[0]];
138 dest
[kGIndex
] = otdata_g
[output
[1]];
139 dest
[kBIndex
] = otdata_b
[output
[2]];
/*
 * Entry point that runs the shared SSE1 template with the channel positions
 * RGBA_R_INDEX, RGBA_G_INDEX and RGBA_B_INDEX. No alpha index is passed, so
 * the template's default kAIndex (NO_A_INDEX) applies and the alpha stores
 * are skipped. transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameters and the function braces are not
 * visible in this excerpt - confirm against the complete file.
 */
144 void qcms_transform_data_rgb_out_lut_sse1(const qcms_transform
*transform
,
145 const unsigned char *src
,
149 qcms_transform_data_template_lut_sse1
<RGBA_R_INDEX
, RGBA_G_INDEX
, RGBA_B_INDEX
>(transform
, src
, dest
, length
);
/*
 * Entry point that runs the shared SSE1 template with the channel positions
 * RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX and the alpha position
 * RGBA_A_INDEX, which enables the template's alpha stores.
 * transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameters and the function braces are not
 * visible in this excerpt - confirm against the complete file.
 */
152 void qcms_transform_data_rgba_out_lut_sse1(const qcms_transform
*transform
,
153 const unsigned char *src
,
157 qcms_transform_data_template_lut_sse1
<RGBA_R_INDEX
, RGBA_G_INDEX
, RGBA_B_INDEX
, RGBA_A_INDEX
>(transform
, src
, dest
, length
);
/*
 * Entry point that runs the shared SSE1 template with the channel positions
 * BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX and the alpha position
 * BGRA_A_INDEX, which enables the template's alpha stores.
 * transform, src, dest and length are forwarded unchanged.
 * NOTE(review): the dest/length parameters and the function braces are not
 * visible in this excerpt - confirm against the complete file.
 */
160 void qcms_transform_data_bgra_out_lut_sse1(const qcms_transform
*transform
,
161 const unsigned char *src
,
165 qcms_transform_data_template_lut_sse1
<BGRA_R_INDEX
, BGRA_G_INDEX
, BGRA_B_INDEX
, BGRA_A_INDEX
>(transform
, src
, dest
, length
);