gfx/qcms/transform.cpp

   1 /* vim: set ts=8 sw=8 noexpandtab: */
   2 //  qcms
   3 //  Copyright (C) 2009 Mozilla Corporation
   4 //  Copyright (C) 1998-2007 Marti Maria
   5 //
   6 // Permission is hereby granted, free of charge, to any person obtaining
   7 // a copy of this software and associated documentation files (the "Software"),
   8 // to deal in the Software without restriction, including without limitation
   9 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10 // and/or sell copies of the Software, and to permit persons to whom the Software
  11 // is furnished to do so, subject to the following conditions:
  12 //
  13 // The above copyright notice and this permission notice shall be included in
  14 // all copies or substantial portions of the Software.
  15 //
  16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23
  24 #include <stdlib.h>
  25 #include <math.h>
  26 #include <assert.h>
  27 #include <string.h> //memcpy
  28 #include "qcmsint.h"
  29 #include "chain.h"
  30 #include "matrix.h"
  31 #include "transform_util.h"
  32
  33 /* for MSVC, GCC, Intel, and Sun compilers */
  34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
  35 #define X86
  36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
  37
  38 /**
  39  * AltiVec detection for PowerPC CPUs
  40  * In case we have a method of detecting do the runtime detection.
  41  * Otherwise statically choose the AltiVec path in case the compiler
  42  * was told to build with AltiVec support.
  43  */
  44 #if (defined(__POWERPC__) || defined(__powerpc__))
  45 #if defined(__linux__)
  46 #include <unistd.h>
  47 #include <fcntl.h>
  48 #include <stdio.h>
  49 #include <elf.h>
  50 #include <linux/auxvec.h>
  51 #include <asm/cputable.h>
  52 #include <link.h>
  53
  54 static inline bool have_altivec() {
  55         static int available = -1;
  56         int new_avail = 0;
  57         ElfW(auxv_t) auxv;
  58         ssize_t count;
  59         int fd, i;
  60
  61         if (available != -1)
  62                 return (available != 0 ? true : false);
  63
  64         fd = open("/proc/self/auxv", O_RDONLY);
  65         if (fd < 0)
  66                 goto out;
  67         do {
  68                 count = read(fd, &auxv, sizeof(auxv));
  69                 if (count < 0)
  70                         goto out_close;
  71
  72                 if (auxv.a_type == AT_HWCAP) {
  73                         new_avail = !!(auxv.a_un.a_val & PPC_FEATURE_HAS_ALTIVEC);
  74                         goto out_close;
  75                 }
  76         } while (auxv.a_type != AT_NULL);
  77
  78 out_close:
  79         close(fd);
  80 out:
  81         available = new_avail;
  82         return (available != 0 ? true : false);
  83 }
  84 #elif defined(__APPLE__) && defined(__MACH__)
  85 #include <sys/sysctl.h>
  86
  87 /**
  88  * rip-off from ffmpeg AltiVec detection code.
  89  * this code also appears on Apple's AltiVec pages.
  90  */
  91 static inline bool have_altivec() {
  92         int sels[2] = {CTL_HW, HW_VECTORUNIT};
  93         static int available = -1;
  94         size_t len = sizeof(available);
  95         int err;
  96
  97         if (available != -1)
  98                 return (available != 0 ? true : false);
  99
 100         err = sysctl(sels, 2, &available, &len, NULL, 0);
 101
 102         if (err == 0)
 103                 if (available != 0)
 104                         return true;
 105
 106         return false;
 107 }
 108 #elif defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
 109 #define have_altivec() true
 110 #else
 111 #define have_altivec() false
 112 #endif
 113 #endif // (defined(__POWERPC__) || defined(__powerpc__))
 114
 115 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
 116 // This is just an approximation, I am not handling all the non-linear
 117 // aspects of the RGB to XYZ process, and assumming that the gamma correction
 118 // has transitive property in the tranformation chain.
 119 //
 120 // the alghoritm:
 121 //
 122 //            - First I build the absolute conversion matrix using
 123 //              primaries in XYZ. This matrix is next inverted
 124 //            - Then I eval the source white point across this matrix
 125 //              obtaining the coeficients of the transformation
 126 //            - Then, I apply these coeficients to the original matrix
 127 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
 128 {
 129         struct matrix primaries;
 130         struct matrix primaries_invert;
 131         struct matrix result;
 132         struct vector white_point;
 133         struct vector coefs;
 134
 135         double xn, yn;
 136         double xr, yr;
 137         double xg, yg;
 138         double xb, yb;
 139
 140         xn = white.x;
 141         yn = white.y;
 142
 143         if (yn == 0.0)
 144                 return matrix_invalid();
 145
 146         xr = primrs.red.x;
 147         yr = primrs.red.y;
 148         xg = primrs.green.x;
 149         yg = primrs.green.y;
 150         xb = primrs.blue.x;
 151         yb = primrs.blue.y;
 152
 153         primaries.m[0][0] = xr;
 154         primaries.m[0][1] = xg;
 155         primaries.m[0][2] = xb;
 156
 157         primaries.m[1][0] = yr;
 158         primaries.m[1][1] = yg;
 159         primaries.m[1][2] = yb;
 160
 161         primaries.m[2][0] = 1 - xr - yr;
 162         primaries.m[2][1] = 1 - xg - yg;
 163         primaries.m[2][2] = 1 - xb - yb;
 164         primaries.invalid = false;
 165
 166         white_point.v[0] = xn/yn;
 167         white_point.v[1] = 1.;
 168         white_point.v[2] = (1.0-xn-yn)/yn;
 169
 170         primaries_invert = matrix_invert(primaries);
 171         if (primaries_invert.invalid) {
 172                 return matrix_invalid();
 173         }
 174
 175         coefs = matrix_eval(primaries_invert, white_point);
 176
 177         result.m[0][0] = coefs.v[0]*xr;
 178         result.m[0][1] = coefs.v[1]*xg;
 179         result.m[0][2] = coefs.v[2]*xb;
 180
 181         result.m[1][0] = coefs.v[0]*yr;
 182         result.m[1][1] = coefs.v[1]*yg;
 183         result.m[1][2] = coefs.v[2]*yb;
 184
 185         result.m[2][0] = coefs.v[0]*(1.-xr-yr);
 186         result.m[2][1] = coefs.v[1]*(1.-xg-yg);
 187         result.m[2][2] = coefs.v[2]*(1.-xb-yb);
 188         result.invalid = primaries_invert.invalid;
 189
 190         return result;
 191 }
 192
 193 struct CIE_XYZ {
 194         double X;
 195         double Y;
 196         double Z;
 197 };
 198
 199 /* CIE Illuminant D50 */
 200 static const struct CIE_XYZ D50_XYZ = {
 201         0.9642,
 202         1.0000,
 203         0.8249
 204 };
 205
 206 /* from lcms: xyY2XYZ()
 207  * corresponds to argyll: icmYxy2XYZ() */
 208 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
 209 {
 210         struct CIE_XYZ dest;
 211         dest.X = (source.x / source.y) * source.Y;
 212         dest.Y = source.Y;
 213         dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
 214         return dest;
 215 }
 216
 217 /* from lcms: ComputeChromaticAdaption */
 218 // Compute chromatic adaption matrix using chad as cone matrix
 219 static struct matrix
 220 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
 221                            struct CIE_XYZ dest_white_point,
 222                            struct matrix chad)
 223 {
 224         struct matrix chad_inv;
 225         struct vector cone_source_XYZ, cone_source_rgb;
 226         struct vector cone_dest_XYZ, cone_dest_rgb;
 227         struct matrix cone, tmp;
 228
 229         tmp = chad;
 230         chad_inv = matrix_invert(tmp);
 231         if (chad_inv.invalid) {
 232                 return matrix_invalid();
 233         }
 234
 235         cone_source_XYZ.v[0] = source_white_point.X;
 236         cone_source_XYZ.v[1] = source_white_point.Y;
 237         cone_source_XYZ.v[2] = source_white_point.Z;
 238
 239         cone_dest_XYZ.v[0] = dest_white_point.X;
 240         cone_dest_XYZ.v[1] = dest_white_point.Y;
 241         cone_dest_XYZ.v[2] = dest_white_point.Z;
 242
 243         cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
 244         cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
 245
 246         cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
 247         cone.m[0][1] = 0;
 248         cone.m[0][2] = 0;
 249         cone.m[1][0] = 0;
 250         cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
 251         cone.m[1][2] = 0;
 252         cone.m[2][0] = 0;
 253         cone.m[2][1] = 0;
 254         cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
 255         cone.invalid = false;
 256
 257         // Normalize
 258         return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
 259 }
 260
 261 /* from lcms: cmsAdaptionMatrix */
 262 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
 263 // Bradford is assumed
 264 static struct matrix
 265 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
 266 {
 267         struct matrix lam_rigg = {{ // Bradford matrix
 268                                  {  0.8951f,  0.2664f, -0.1614f },
 269                                  { -0.7502f,  1.7135f,  0.0367f },
 270                                  {  0.0389f, -0.0685f,  1.0296f }
 271                                  }};
 272         return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
 273 }
 274
 275 /* from lcms: cmsAdaptMatrixToD50 */
 276 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
 277 {
 278         struct CIE_XYZ Dn;
 279         struct matrix Bradford;
 280
 281         if (source_white_pt.y == 0.0) {
 282                 return matrix_invalid();
 283         }
 284
 285         Dn = xyY2XYZ(source_white_pt);
 286
 287         Bradford = adaption_matrix(Dn, D50_XYZ);
 288         if (Bradford.invalid) {
 289                 return matrix_invalid();
 290         }
 291         return matrix_multiply(Bradford, r);
 292 }
 293
 294 bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
 295 {
 296         struct matrix colorants;
 297         colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 298         colorants = adapt_matrix_to_D50(colorants, white_point);
 299
 300         if (colorants.invalid)
 301                 return false;
 302
 303         /* note: there's a transpose type of operation going on here */
 304         profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
 305         profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
 306         profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
 307
 308         profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
 309         profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
 310         profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
 311
 312         profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
 313         profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
 314         profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
 315
 316         return true;
 317 }
 318
 319 bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
 320 {
 321         *colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 322         *colorants = adapt_matrix_to_D50(*colorants, white_point);
 323
 324         return (colorants->invalid ? true : false);
 325 }
 326
 327 #if 0
 328 static void qcms_transform_data_rgb_out_pow(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 329 {
 330         int i;
 331         const float (*mat)[4] = transform->matrix;
 332         for (i=0; i<length; i++) {
 333                 unsigned char device_r = *src++;
 334                 unsigned char device_g = *src++;
 335                 unsigned char device_b = *src++;
 336
 337                 float linear_r = transform->input_gamma_table_r[device_r];
 338                 float linear_g = transform->input_gamma_table_g[device_g];
 339                 float linear_b = transform->input_gamma_table_b[device_b];
 340
 341                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 342                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 343                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 344
 345                 float out_device_r = pow(out_linear_r, transform->out_gamma_r);
 346                 float out_device_g = pow(out_linear_g, transform->out_gamma_g);
 347                 float out_device_b = pow(out_linear_b, transform->out_gamma_b);
 348
 349                 dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
 350                 dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
 351                 dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
 352                 dest += RGB_OUTPUT_COMPONENTS;
 353         }
 354 }
 355 #endif
 356
 357 /* Alpha is not corrected.
 358    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
 359    RGB Is?" Tech Memo 17 (December 14, 1998).
 360         See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
 361 */
 362
 363 template <size_t kRIndex, size_t kGIndex, size_t kBIndex,
 364           size_t kInAIndex = NO_A_INDEX, size_t kOutAIndex = kInAIndex>
 365 static void qcms_transform_data_gray_template_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 366 {
 367         const unsigned int components = A_INDEX_COMPONENTS(kOutAIndex);
 368         unsigned int i;
 369         for (i = 0; i < length; i++) {
 370                 float out_device_r, out_device_g, out_device_b;
 371                 unsigned char device = *src++;
 372                 unsigned char alpha = 0xFF;
 373                 if (kInAIndex != NO_A_INDEX) {
 374                         alpha = *src++;
 375                 }
 376
 377                 float linear = transform->input_gamma_table_gray[device];
 378
 379                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 380                 out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 381                 out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 382
 383                 dest[kRIndex] = clamp_u8(out_device_r*255);
 384                 dest[kGIndex] = clamp_u8(out_device_g*255);
 385                 dest[kBIndex] = clamp_u8(out_device_b*255);
 386                 if (kOutAIndex != NO_A_INDEX) {
 387                         dest[kOutAIndex] = alpha;
 388                 }
 389                 dest += components;
 390         }
 391 }
 392
 393 static void qcms_transform_data_gray_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 394 {
 395         qcms_transform_data_gray_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 396 }
 397
 398 static void qcms_transform_data_gray_rgba_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 399 {
 400         qcms_transform_data_gray_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, NO_A_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 401 }
 402
 403 static void qcms_transform_data_gray_bgra_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 404 {
 405         qcms_transform_data_gray_template_lut<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, NO_A_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 406 }
 407
 408 static void qcms_transform_data_graya_rgba_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 409 {
 410         qcms_transform_data_gray_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 411 }
 412
 413 static void qcms_transform_data_graya_bgra_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 414 {
 415         qcms_transform_data_gray_template_lut<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 416 }
 417
 418 template <size_t kRIndex, size_t kGIndex, size_t kBIndex,
 419           size_t kInAIndex = NO_A_INDEX, size_t kOutAIndex = kInAIndex>
 420 static void qcms_transform_data_gray_template_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 421 {
 422         const unsigned int components = A_INDEX_COMPONENTS(kOutAIndex);
 423         unsigned int i;
 424         for (i = 0; i < length; i++) {
 425                 unsigned char device = *src++;
 426                 unsigned char alpha = 0xFF;
 427                 if (kInAIndex != NO_A_INDEX) {
 428                        alpha = *src++;
 429                 }
 430                 uint16_t gray;
 431
 432                 float linear = transform->input_gamma_table_gray[device];
 433
 434                 /* we could round here... */
 435                 gray = linear * PRECACHE_OUTPUT_MAX;
 436
 437                 dest[kRIndex] = transform->output_table_r->data[gray];
 438                 dest[kGIndex] = transform->output_table_g->data[gray];
 439                 dest[kBIndex] = transform->output_table_b->data[gray];
 440                 if (kOutAIndex != NO_A_INDEX) {
 441                         dest[kOutAIndex] = alpha;
 442                 }
 443                 dest += components;
 444         }
 445 }
 446
 447 static void qcms_transform_data_gray_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 448 {
 449         qcms_transform_data_gray_template_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 450 }
 451
 452 static void qcms_transform_data_gray_rgba_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 453 {
 454         qcms_transform_data_gray_template_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, NO_A_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 455 }
 456
 457 static void qcms_transform_data_gray_bgra_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 458 {
 459         qcms_transform_data_gray_template_precache<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, NO_A_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 460 }
 461
 462 static void qcms_transform_data_graya_rgba_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 463 {
 464         qcms_transform_data_gray_template_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 465 }
 466
 467 static void qcms_transform_data_graya_bgra_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 468 {
 469         qcms_transform_data_gray_template_precache<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 470 }
 471
 472 template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
 473 static void qcms_transform_data_template_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 474 {
 475         const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 476         unsigned int i;
 477         const float (*mat)[4] = transform->matrix;
 478         for (i = 0; i < length; i++) {
 479                 unsigned char device_r = src[kRIndex];
 480                 unsigned char device_g = src[kGIndex];
 481                 unsigned char device_b = src[kBIndex];
 482                 unsigned char alpha;
 483                 if (kAIndex != NO_A_INDEX) {
 484                         alpha = src[kAIndex];
 485                 }
 486                 src += components;
 487                 uint16_t r, g, b;
 488
 489                 float linear_r = transform->input_gamma_table_r[device_r];
 490                 float linear_g = transform->input_gamma_table_g[device_g];
 491                 float linear_b = transform->input_gamma_table_b[device_b];
 492
 493                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 494                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 495                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 496
 497                 out_linear_r = clamp_float(out_linear_r);
 498                 out_linear_g = clamp_float(out_linear_g);
 499                 out_linear_b = clamp_float(out_linear_b);
 500
 501                 /* we could round here... */
 502                 r = out_linear_r * PRECACHE_OUTPUT_MAX;
 503                 g = out_linear_g * PRECACHE_OUTPUT_MAX;
 504                 b = out_linear_b * PRECACHE_OUTPUT_MAX;
 505
 506                 dest[kRIndex] = transform->output_table_r->data[r];
 507                 dest[kGIndex] = transform->output_table_g->data[g];
 508                 dest[kBIndex] = transform->output_table_b->data[b];
 509                 if (kAIndex != NO_A_INDEX) {
 510                         dest[kAIndex] = alpha;
 511                 }
 512                 dest += components;
 513         }
 514 }
 515
 516 void qcms_transform_data_rgb_out_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 517 {
 518         qcms_transform_data_template_lut_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 519 }
 520
 521 void qcms_transform_data_rgba_out_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 522 {
 523         qcms_transform_data_template_lut_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 524 }
 525
 526 void qcms_transform_data_bgra_out_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 527 {
 528         qcms_transform_data_template_lut_precache<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 529 }
 530
 531 // Not used
 532 /*
 533 static void qcms_transform_data_clut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 534         unsigned int i;
 535         int xy_len = 1;
 536         int x_len = transform->grid_size;
 537         int len = x_len * x_len;
 538         const float* r_table = transform->r_clut;
 539         const float* g_table = transform->g_clut;
 540         const float* b_table = transform->b_clut;
 541
 542         for (i = 0; i < length; i++) {
 543                 unsigned char in_r = *src++;
 544                 unsigned char in_g = *src++;
 545                 unsigned char in_b = *src++;
 546                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 547
 548                 int x = floorf(linear_r * (transform->grid_size-1));
 549                 int y = floorf(linear_g * (transform->grid_size-1));
 550                 int z = floorf(linear_b * (transform->grid_size-1));
 551                 int x_n = ceilf(linear_r * (transform->grid_size-1));
 552                 int y_n = ceilf(linear_g * (transform->grid_size-1));
 553                 int z_n = ceilf(linear_b * (transform->grid_size-1));
 554                 float x_d = linear_r * (transform->grid_size-1) - x;
 555                 float y_d = linear_g * (transform->grid_size-1) - y;
 556                 float z_d = linear_b * (transform->grid_size-1) - z;
 557
 558                 float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
 559                 float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
 560                 float r_y1 = lerp(r_x1, r_x2, y_d);
 561                 float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
 562                 float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
 563                 float r_y2 = lerp(r_x3, r_x4, y_d);
 564                 float clut_r = lerp(r_y1, r_y2, z_d);
 565
 566                 float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
 567                 float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
 568                 float g_y1 = lerp(g_x1, g_x2, y_d);
 569                 float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
 570                 float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
 571                 float g_y2 = lerp(g_x3, g_x4, y_d);
 572                 float clut_g = lerp(g_y1, g_y2, z_d);
 573
 574                 float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
 575                 float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
 576                 float b_y1 = lerp(b_x1, b_x2, y_d);
 577                 float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
 578                 float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
 579                 float b_y2 = lerp(b_x3, b_x4, y_d);
 580                 float clut_b = lerp(b_y1, b_y2, z_d);
 581
 582                 *dest++ = clamp_u8(clut_r*255.0f);
 583                 *dest++ = clamp_u8(clut_g*255.0f);
 584                 *dest++ = clamp_u8(clut_b*255.0f);
 585         }
 586 }
 587 */
 588
 589 static int int_div_ceil(int value, int div) {
 590         return ((value  + div - 1) / div);
 591 }
 592
 593 // Using lcms' tetra interpolation algorithm.
 594 template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
 595 static void qcms_transform_data_tetra_clut_template(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 596         const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 597         unsigned int i;
 598         int xy_len = 1;
 599         int x_len = transform->grid_size;
 600         int len = x_len * x_len;
 601         float* r_table = transform->r_clut;
 602         float* g_table = transform->g_clut;
 603         float* b_table = transform->b_clut;
 604         float c0_r, c1_r, c2_r, c3_r;
 605         float c0_g, c1_g, c2_g, c3_g;
 606         float c0_b, c1_b, c2_b, c3_b;
 607         float clut_r, clut_g, clut_b;
 608         for (i = 0; i < length; i++) {
 609                 unsigned char in_r = src[kRIndex];
 610                 unsigned char in_g = src[kGIndex];
 611                 unsigned char in_b = src[kBIndex];
 612                 unsigned char in_a;
 613                 if (kAIndex != NO_A_INDEX) {
 614                         in_a = src[kAIndex];
 615                 }
 616                 src += components;
 617                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 618
 619                 int x = in_r * (transform->grid_size-1) / 255;
 620                 int y = in_g * (transform->grid_size-1) / 255;
 621                 int z = in_b * (transform->grid_size-1) / 255;
 622                 int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
 623                 int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
 624                 int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
 625                 float rx = linear_r * (transform->grid_size-1) - x;
 626                 float ry = linear_g * (transform->grid_size-1) - y;
 627                 float rz = linear_b * (transform->grid_size-1) - z;
 628
 629                 c0_r = CLU(r_table, x, y, z);
 630                 c0_g = CLU(g_table, x, y, z);
 631                 c0_b = CLU(b_table, x, y, z);
 632
 633                 if( rx >= ry ) {
 634                         if (ry >= rz) { //rx >= ry && ry >= rz
 635                                 c1_r = CLU(r_table, x_n, y, z) - c0_r;
 636                                 c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
 637                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 638                                 c1_g = CLU(g_table, x_n, y, z) - c0_g;
 639                                 c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
 640                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 641                                 c1_b = CLU(b_table, x_n, y, z) - c0_b;
 642                                 c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
 643                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 644                         } else {
 645                                 if (rx >= rz) { //rx >= rz && rz >= ry
 646                                         c1_r = CLU(r_table, x_n, y, z) - c0_r;
 647                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 648                                         c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
 649                                         c1_g = CLU(g_table, x_n, y, z) - c0_g;
 650                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 651                                         c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
 652                                         c1_b = CLU(b_table, x_n, y, z) - c0_b;
 653                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 654                                         c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
 655                                 } else { //rz > rx && rx >= ry
 656                                         c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
 657                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 658                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 659                                         c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
 660                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 661                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 662                                         c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
 663                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 664                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 665                                 }
 666                         }
 667                 } else {
 668                         if (rx >= rz) { //ry > rx && rx >= rz
 669                                 c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
 670                                 c2_r = CLU(r_table, x, y_n, z) - c0_r;
 671                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 672                                 c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
 673                                 c2_g = CLU(g_table, x, y_n, z) - c0_g;
 674                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 675                                 c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
 676                                 c2_b = CLU(b_table, x, y_n, z) - c0_b;
 677                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 678                         } else {
 679                                 if (ry >= rz) { //ry >= rz && rz > rx
 680                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 681                                         c2_r = CLU(r_table, x, y_n, z) - c0_r;
 682                                         c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
 683                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 684                                         c2_g = CLU(g_table, x, y_n, z) - c0_g;
 685                                         c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
 686                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 687                                         c2_b = CLU(b_table, x, y_n, z) - c0_b;
 688                                         c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
 689                                 } else { //rz > ry && ry > rx
 690                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 691                                         c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
 692                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 693                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 694                                         c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
 695                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 696                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 697                                         c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
 698                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 699                                 }
 700                         }
 701                 }
 702
 703                 clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 704                 clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 705                 clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 706
 707                 dest[kRIndex] = clamp_u8(clut_r*255.0f);
 708                 dest[kGIndex] = clamp_u8(clut_g*255.0f);
 709                 dest[kBIndex] = clamp_u8(clut_b*255.0f);
 710                 if (kAIndex != NO_A_INDEX) {
 711                         dest[kAIndex] = in_a;
 712                 }
 713                 dest += components;
 714         }
 715 }
 716
 717 static void qcms_transform_data_tetra_clut_rgb(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 718         qcms_transform_data_tetra_clut_template<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 719 }
 720
 721 static void qcms_transform_data_tetra_clut_rgba(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 722         qcms_transform_data_tetra_clut_template<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 723 }
 724
 725 static void qcms_transform_data_tetra_clut_bgra(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 726         qcms_transform_data_tetra_clut_template<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 727 }
 728
 729 template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
 730 static void qcms_transform_data_template_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 731 {
 732         const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 733         unsigned int i;
 734         const float (*mat)[4] = transform->matrix;
 735         for (i = 0; i < length; i++) {
 736                 unsigned char device_r = src[kRIndex];
 737                 unsigned char device_g = src[kGIndex];
 738                 unsigned char device_b = src[kBIndex];
 739                 unsigned char alpha;
 740                 if (kAIndex != NO_A_INDEX) {
 741                         alpha = src[kAIndex];
 742                 }
 743                 src += components;
 744                 float out_device_r, out_device_g, out_device_b;
 745
 746                 float linear_r = transform->input_gamma_table_r[device_r];
 747                 float linear_g = transform->input_gamma_table_g[device_g];
 748                 float linear_b = transform->input_gamma_table_b[device_b];
 749
 750                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 751                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 752                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 753
 754                 out_linear_r = clamp_float(out_linear_r);
 755                 out_linear_g = clamp_float(out_linear_g);
 756                 out_linear_b = clamp_float(out_linear_b);
 757
 758                 out_device_r = lut_interp_linear(out_linear_r,
 759                                 transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 760                 out_device_g = lut_interp_linear(out_linear_g,
 761                                 transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 762                 out_device_b = lut_interp_linear(out_linear_b,
 763                                 transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 764
 765                 dest[kRIndex] = clamp_u8(out_device_r*255);
 766                 dest[kGIndex] = clamp_u8(out_device_g*255);
 767                 dest[kBIndex] = clamp_u8(out_device_b*255);
 768                 if (kAIndex != NO_A_INDEX) {
 769                         dest[kAIndex] = alpha;
 770                 }
 771                 dest += components;
 772         }
 773 }
 774
 775 void qcms_transform_data_rgb_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 776 {
 777         qcms_transform_data_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 778 }
 779
 780 void qcms_transform_data_rgba_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 781 {
 782         qcms_transform_data_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 783 }
 784
 785 void qcms_transform_data_bgra_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 786 {
 787         qcms_transform_data_template_lut<BGRA_R_INDEX, BGRA_G_INDEX, BGRA_B_INDEX, BGRA_A_INDEX>(transform, src, dest, length);
 788 }
 789
 790 #if 0
     /*
      * Compiled out by the surrounding #if 0.
      *
      * Variant of the RGB transform that applies the input gamma tables and the
      * colour matrix only: the matrix result is scaled by 255 and passed
      * straight to clamp_u8(), with no output gamma LUT step.  Reads and writes
      * three bytes per pixel.
      *
      * NOTE(review): `i` is a signed int compared against a size_t `length`;
      * that would need fixing before this code is ever re-enabled.
      */
 791 static void qcms_transform_data_rgb_out_linear(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 792 {
 793         int i;
 794         const float (*mat)[4] = transform->matrix;
 795         for (i = 0; i < length; i++) {
 796                 unsigned char device_r = *src++;
 797                 unsigned char device_g = *src++;
 798                 unsigned char device_b = *src++;
 799
 800                 float linear_r = transform->input_gamma_table_r[device_r];
 801                 float linear_g = transform->input_gamma_table_g[device_g];
 802                 float linear_b = transform->input_gamma_table_b[device_b];
 803
 804                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 805                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 806                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 807
 808                 *dest++ = clamp_u8(out_linear_r*255);
 809                 *dest++ = clamp_u8(out_linear_g*255);
 810                 *dest++ = clamp_u8(out_linear_b*255);
 811         }
 812 }
 813 #endif
 814
 815 /*
 816  * If users create and destroy objects on different threads, even if the same
 817  * objects aren't used on different threads at the same time, we can still run
 818  * into trouble with refcounts if they aren't atomic.
 819  *
 820  * This can lead to us prematurely deleting the precache if threads get unlucky
 821  * and write the wrong value to the ref count.
 822  */
 823 static struct precache_output *precache_reference(struct precache_output *p)
 824 {
 825         qcms_atomic_increment(p->ref_count);
 826         return p;
 827 }
 828
 829 static struct precache_output *precache_create()
 830 {
 831         struct precache_output *p = (struct precache_output*)malloc(sizeof(struct precache_output));
 832         if (p)
 833                 p->ref_count = 1;
 834         return p;
 835 }
 836
 837 void precache_release(struct precache_output *p)
 838 {
 839         if (qcms_atomic_decrement(p->ref_count) == 0) {
 840                 free(p);
 841         }
 842 }
 843
 844 #ifdef HAVE_POSIX_MEMALIGN
 845 static qcms_transform *transform_alloc(void)
 846 {
 847         qcms_transform *t;
 848
 849         void *allocated_memory;
 850         if (!posix_memalign(&allocated_memory, 16, sizeof(qcms_transform))) {
 851                 /* Doing a memset to initialise all bits to 'zero'*/
 852                 memset(allocated_memory, 0, sizeof(qcms_transform));
 853                 t = (qcms_transform*)allocated_memory;
 854                 return t;
 855         } else {
 856                 return NULL;
 857         }
 858 }
 859 static void transform_free(qcms_transform *t)
 860 {
 861         free(t);
 862 }
 863 #else
 864 static qcms_transform *transform_alloc(void)
 865 {
 866         /* transform needs to be aligned on a 16byte boundrary */
 867         char *original_block = (char *)calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
 868         /* make room for a pointer to the block returned by calloc */
 869         void *transform_start = original_block + sizeof(void*);
 870         /* align transform_start */
 871         qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
 872
 873         /* store a pointer to the block returned by calloc so that we can free it later */
 874         void **(original_block_ptr) = (void**)transform_aligned;
 875         if (!original_block)
 876                 return NULL;
 877         original_block_ptr--;
 878         *original_block_ptr = original_block;
 879
 880         return transform_aligned;
 881 }
 882 static void transform_free(qcms_transform *t)
 883 {
 884         /* get at the pointer to the unaligned block returned by calloc */
 885         void **p = (void**)t;
 886         p--;
 887         free(*p);
 888 }
 889 #endif
 890
 891 void qcms_transform_release(qcms_transform *t)
 892 {
 893         /* ensure we only free the gamma tables once even if there are
 894          * multiple references to the same data */
 895
 896         if (t->output_table_r)
 897                 precache_release(t->output_table_r);
 898         if (t->output_table_g)
 899                 precache_release(t->output_table_g);
 900         if (t->output_table_b)
 901                 precache_release(t->output_table_b);
 902
 903         free(t->input_gamma_table_r);
 904         if (t->input_gamma_table_g != t->input_gamma_table_r)
 905                 free(t->input_gamma_table_g);
 906         if (t->input_gamma_table_g != t->input_gamma_table_r &&
 907             t->input_gamma_table_g != t->input_gamma_table_b)
 908                 free(t->input_gamma_table_b);
 909
 910         free(t->input_gamma_table_gray);
 911
 912         free(t->output_gamma_lut_r);
 913         free(t->output_gamma_lut_g);
 914         free(t->output_gamma_lut_b);
 915
 916         /* r_clut points to beginning of buffer allocated in qcms_transform_precacheLUT_float */
 917         if (t->r_clut)
 918                 free(t->r_clut);
 919
 920         transform_free(t);
 921 }
 922
 923 #ifdef X86
 924 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 925 // mozilla/jpeg)
 926  // -------------------------------------------------------------------------
 927 #if defined(_M_IX86) && defined(_MSC_VER)
 928 #define HAS_CPUID
 929 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
 930    register - I'm not sure if that ever happens on windows, but cpuid isn't
 931    on the critical path so we just preserve the register to be safe and to be
 932    consistent with the non-windows version.

        Executes the CPUID instruction with EAX loaded from fxn and stores the
        resulting EAX, EBX, ECX and EDX values through a, b, c and d
        respectively. */
 933 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
 934        uint32_t a_, b_, c_, d_;
            /* The first xchg parks the incoming EBX value in ESI; the second
               one, after the results have been copied to the locals, swaps it
               back so EBX leaves the block with the value it entered with. */
 935        __asm {
 936               xchg   ebx, esi
 937               mov    eax, fxn
 938               cpuid
 939               mov    a_, eax
 940               mov    b_, ebx
 941               mov    c_, ecx
 942               mov    d_, edx
 943               xchg   ebx, esi
 944        }
            /* hand the captured register values back to the caller */
 945        *a = a_;
 946        *b = b_;
 947        *c = c_;
 948        *d = d_;
 949 }
 950 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
 951 #define HAS_CPUID
 952 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
 953    some platforms, so we use ESI instead and save ebx to avoid clobbering it.

        Executes the CPUID instruction with EAX loaded from fxn and stores the
        resulting EAX, EBX, ECX and EDX values through a, b, c and d
        respectively. */
 954 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
 955
 956         uint32_t a_, b_, c_, d_;
             /* xchgl swaps EBX and ESI around the cpuid instruction: EBX ends up
                with the value it had on entry, and the EBX result of cpuid is
                left in ESI, which is read through the "=S" output operand. */
 957        __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
 958                              : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
             /* hand the captured register values back to the caller */
 959            *a = a_;
 960            *b = b_;
 961            *c = c_;
 962            *d = d_;
 963 }
 964 #endif
 965
 966 // -------------------------Runtime SSEx Detection-----------------------------
 967
 968 /* MMX is always supported per
 969  *  Gecko v1.9.1 minimum CPU requirements */
 970 #define SSE1_EDX_MASK (1UL << 25)
 971 #define SSE2_EDX_MASK (1UL << 26)
 972 #define SSE3_ECX_MASK (1UL <<  0)
 973
 /*
  * Report the highest SSE level this code may use: 0 (none), 1, 2 or 3.
  *
  * 64-bit x86 builds return 2 unconditionally.  Builds that have a cpuid()
  * helper query CPUID function 1 on the first call and cache the answer in a
  * function-local static.  All other builds return 0.
  */
 static int sse_version_available(void)
 {
 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
         /* 64-bit CPUs always have SSE2, and saying so at build time lets the
          * compiler drop the SSE1 and non-SIMD branches as unreachable */
         return 2;
 #elif defined(HAS_CPUID)
         static int sse_version = -1;

         /* already probed on an earlier call */
         if (sse_version != -1)
                 return sse_version;

         uint32_t a, b, c, d;
         const uint32_t function = 0x00000001;

         sse_version = 0;
         cpuid(function, &a, &b, &c, &d);

         /* check the newest extension first */
         if (c & SSE3_ECX_MASK) {
                 sse_version = 3;
         } else if (d & SSE2_EDX_MASK) {
                 sse_version = 2;
         } else if (d & SSE1_EDX_MASK) {
                 sse_version = 1;
         }

         return sse_version;
 #else
         return 0;
 #endif
 }
1002 #endif
1003
1004 static const struct matrix bradford_matrix = {{ { 0.8951f, 0.2664f,-0.1614f},
1005                                                 {-0.7502f, 1.7135f, 0.0367f},
1006                                                 { 0.0389f,-0.0685f, 1.0296f}},
1007                                                 false};
1008
1009 static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
1010                                                     { 0.4323053f, 0.5183603f, 0.0492912f},
1011                                                     {-0.0085287f, 0.0400428f, 0.9684867f}},
1012                                                     false};
1013
1014 // See ICCv4 E.3
1015 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
1016         float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
1017                   (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
1018         float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
1019                   (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
1020         float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
1021                   (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
1022         struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
1023         return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
1024 }
1025
1026 void qcms_profile_precache_output_transform(qcms_profile *profile)
1027 {
1028         /* we only support precaching on rgb profiles */
1029         if (profile->color_space != RGB_SIGNATURE)
1030                 return;
1031
1032         if (qcms_supports_iccv4) {
1033                 /* don't precache since we will use the B2A LUT */
1034                 if (profile->B2A0)
1035                         return;
1036
1037                 /* don't precache since we will use the mBA LUT */
1038                 if (profile->mBA)
1039                         return;
1040         }
1041
1042         /* don't precache if we do not have the TRC curves */
1043         if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
1044                 return;
1045
1046         if (!profile->output_table_r) {
1047                 profile->output_table_r = precache_create();
1048                 if (profile->output_table_r &&
1049                                 !compute_precache(profile->redTRC, profile->output_table_r->data)) {
1050                         precache_release(profile->output_table_r);
1051                         profile->output_table_r = NULL;
1052                 }
1053         }
1054         if (!profile->output_table_g) {
1055                 profile->output_table_g = precache_create();
1056                 if (profile->output_table_g &&
1057                                 !compute_precache(profile->greenTRC, profile->output_table_g->data)) {
1058                         precache_release(profile->output_table_g);
1059                         profile->output_table_g = NULL;
1060                 }
1061         }
1062         if (!profile->output_table_b) {
1063                 profile->output_table_b = precache_create();
1064                 if (profile->output_table_b &&
1065                                 !compute_precache(profile->blueTRC, profile->output_table_b->data)) {
1066                         precache_release(profile->output_table_b);
1067                         profile->output_table_b = NULL;
1068                 }
1069         }
1070 }
1071
1072 /* Replace the current transformation with a LUT transformation using a given number of sample points */
1073 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
1074                                                  int samples, qcms_data_type in_type)
1075 {
1076         /* The range between which 2 consecutive sample points can be used to interpolate */
1077         uint16_t x,y,z;
1078         uint32_t l;
1079         uint32_t lutSize = 3 * samples * samples * samples;
1080         float* src = NULL;
1081         float* dest = NULL;
1082         float* lut = NULL;
1083
1084         src = (float*)malloc(lutSize*sizeof(float));
1085         dest = (float*)malloc(lutSize*sizeof(float));
1086
1087         if (src && dest) {
1088                 /* Prepare a list of points we want to sample */
1089                 l = 0;
1090                 for (x = 0; x < samples; x++) {
1091                         for (y = 0; y < samples; y++) {
1092                                 for (z = 0; z < samples; z++) {
1093                                         src[l++] = x / (float)(samples-1);
1094                                         src[l++] = y / (float)(samples-1);
1095                                         src[l++] = z / (float)(samples-1);
1096                                 }
1097                         }
1098                 }
1099
1100                 lut = qcms_chain_transform(in, out, src, dest, lutSize);
1101                 if (lut) {
1102                         transform->r_clut = &lut[0];
1103                         transform->g_clut = &lut[1];
1104                         transform->b_clut = &lut[2];
1105                         transform->grid_size = samples;
1106                         if (in_type == QCMS_DATA_RGBA_8) {
1107                                 transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
1108                         } else if (in_type == QCMS_DATA_BGRA_8) {
1109                                 transform->transform_fn = qcms_transform_data_tetra_clut_bgra;
1110                         } else if (in_type == QCMS_DATA_RGB_8) {
1111                                 transform->transform_fn = qcms_transform_data_tetra_clut_rgb;
1112                         }
1113                         assert(transform->transform_fn);
1114                 }
1115         }
1116
1117
1118         //XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
1119         // It will be stored in r_clut, which will be cleaned up in qcms_transform_release.
1120         if (src && lut != src) {
1121                 free(src);
1122         }
1123         if (dest && lut != dest) {
1124                 free(dest);
1125         }
1126
1127         if (lut == NULL) {
1128                 return NULL;
1129         }
1130         return transform;
1131 }
1132
1133 #define NO_MEM_TRANSFORM NULL
1134
1135 qcms_transform* qcms_transform_create(
1136                 qcms_profile *in, qcms_data_type in_type,
1137                 qcms_profile *out, qcms_data_type out_type,
1138                 qcms_intent intent)
1139 {
1140         // Ensure the requested input and output types make sense.
1141         bool match = false;
1142         if (in_type == QCMS_DATA_RGB_8) {
1143                 match = out_type == QCMS_DATA_RGB_8;
1144         } else if (in_type == QCMS_DATA_RGBA_8) {
1145                 match = out_type == QCMS_DATA_RGBA_8;
1146         } else if (in_type == QCMS_DATA_BGRA_8) {
1147                 match = out_type == QCMS_DATA_BGRA_8;
1148         } else if (in_type == QCMS_DATA_GRAY_8) {
1149                 match = out_type == QCMS_DATA_RGB_8 || out_type == QCMS_DATA_RGBA_8 || out_type == QCMS_DATA_BGRA_8;
1150         } else if (in_type == QCMS_DATA_GRAYA_8) {
1151                 match = out_type == QCMS_DATA_RGBA_8 || out_type == QCMS_DATA_BGRA_8;
1152         }
1153         if (!match) {
1154                 assert(0 && "input/output type");
1155                 return NULL;
1156         }
1157
1158         qcms_transform *transform = transform_alloc();
1159         if (!transform) {
1160                 return NULL;
1161         }
1162
1163         bool precache = false;
1164         if (out->output_table_r &&
1165                         out->output_table_g &&
1166                         out->output_table_b) {
1167                 precache = true;
1168         }
1169
1170         // This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
1171         if (qcms_supports_iccv4 &&
1172                         (in_type == QCMS_DATA_RGB_8 || in_type == QCMS_DATA_RGBA_8 || in_type == QCMS_DATA_BGRA_8) &&
1173                         (in->A2B0 || out->B2A0 || in->mAB || out->mAB))
1174                 {
1175                 // Precache the transformation to a CLUT 33x33x33 in size.
1176                 // 33 is used by many profiles and works well in pratice.
1177                 // This evenly divides 256 into blocks of 8x8x8.
1178                 // TODO For transforming small data sets of about 200x200 or less
1179                 // precaching should be avoided.
1180                 qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
1181                 if (!result) {
1182                         assert(0 && "precacheLUT failed");
1183                         qcms_transform_release(transform);
1184                         return NULL;
1185                 }
1186                 return result;
1187         }
1188
1189         if (precache) {
1190                 transform->output_table_r = precache_reference(out->output_table_r);
1191                 transform->output_table_g = precache_reference(out->output_table_g);
1192                 transform->output_table_b = precache_reference(out->output_table_b);
1193         } else {
1194                 if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
1195                         qcms_transform_release(transform);
1196                         return NO_MEM_TRANSFORM;
1197                 }
1198                 build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
1199                 build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
1200                 build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
1201                 if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
1202                         qcms_transform_release(transform);
1203                         return NO_MEM_TRANSFORM;
1204                 }
1205         }
1206
1207         if (in->color_space == RGB_SIGNATURE) {
1208                 struct matrix in_matrix, out_matrix, result;
1209                 if (precache) {
1210 #ifdef X86
1211                     if (sse_version_available() >= 2) {
1212                             if (in_type == QCMS_DATA_RGB_8) {
1213                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
1214                             } else if (in_type == QCMS_DATA_RGBA_8) {
1215                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
1216                             } else if (in_type == QCMS_DATA_BGRA_8) {
1217                                     transform->transform_fn = qcms_transform_data_bgra_out_lut_sse2;
1218                             }
1219
1220 #if !(defined(_MSC_VER) && defined(_M_AMD64))
1221                     /* Microsoft Compiler for x64 doesn't support MMX.
1222                      * SSE code uses MMX so that we disable on x64 */
1223                     } else
1224                     if (sse_version_available() >= 1) {
1225                             if (in_type == QCMS_DATA_RGB_8) {
1226                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
1227                             } else if (in_type == QCMS_DATA_RGBA_8) {
1228                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
1229                             } else if (in_type == QCMS_DATA_BGRA_8) {
1230                                     transform->transform_fn = qcms_transform_data_bgra_out_lut_sse1;
1231                             }
1232 #endif
1233                     } else
1234 #endif
1235 #if defined(__arm__) || defined(__aarch64__)
1236                     if (qcms_supports_neon) {
1237                             if (in_type == QCMS_DATA_RGB_8) {
1238                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_neon;
1239                             } else if (in_type == QCMS_DATA_RGBA_8) {
1240                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_neon;
1241                             } else if (in_type == QCMS_DATA_BGRA_8) {
1242                                     transform->transform_fn = qcms_transform_data_bgra_out_lut_neon;
1243                             }
1244                     } else
1245 #endif
1246 #if (defined(__POWERPC__) || defined(__powerpc__) && !defined(__NO_FPRS__))
1247                     if (have_altivec()) {
1248                             if (in_type == QCMS_DATA_RGB_8) {
1249                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_altivec;
1250                             } else if (in_type == QCMS_DATA_RGBA_8) {
1251                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_altivec;
1252                             } else if (in_type == QCMS_DATA_BGRA_8) {
1253                                     transform->transform_fn = qcms_transform_data_bgra_out_lut_altivec;
1254                             }
1255                     } else
1256 #endif
1257                         {
1258                                 if (in_type == QCMS_DATA_RGB_8) {
1259                                         transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
1260                                 } else if (in_type == QCMS_DATA_RGBA_8) {
1261                                         transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
1262                                 } else if (in_type == QCMS_DATA_BGRA_8) {
1263                                         transform->transform_fn = qcms_transform_data_bgra_out_lut_precache;
1264                                 }
1265                         }
1266                 } else {
1267                         if (in_type == QCMS_DATA_RGB_8) {
1268                                 transform->transform_fn = qcms_transform_data_rgb_out_lut;
1269                         } else if (in_type == QCMS_DATA_RGBA_8) {
1270                                 transform->transform_fn = qcms_transform_data_rgba_out_lut;
1271                         } else if (in_type == QCMS_DATA_BGRA_8) {
1272                                 transform->transform_fn = qcms_transform_data_bgra_out_lut;
1273                         }
1274                 }
1275
1276                 //XXX: avoid duplicating tables if we can
1277                 transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
1278                 transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
1279                 transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
1280                 if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
1281                         qcms_transform_release(transform);
1282                         return NO_MEM_TRANSFORM;
1283                 }
1284
1285
1286                 /* build combined colorant matrix */
1287                 in_matrix = build_colorant_matrix(in);
1288                 out_matrix = build_colorant_matrix(out);
1289                 out_matrix = matrix_invert(out_matrix);
1290                 if (out_matrix.invalid) {
1291                         qcms_transform_release(transform);
1292                         return NULL;
1293                 }
1294                 result = matrix_multiply(out_matrix, in_matrix);
1295
1296                 /* check for NaN values in the matrix and bail if we find any */
1297                 for (unsigned i = 0 ; i < 3 ; ++i) {
1298                         for (unsigned j = 0 ; j < 3 ; ++j) {
1299                                 if (result.m[i][j] != result.m[i][j]) {
1300                                         qcms_transform_release(transform);
1301                                         return NULL;
1302                                 }
1303                         }
1304                 }
1305
1306                 /* store the results in column major mode
1307                  * this makes doing the multiplication with sse easier */
1308                 transform->matrix[0][0] = result.m[0][0];
1309                 transform->matrix[1][0] = result.m[0][1];
1310                 transform->matrix[2][0] = result.m[0][2];
1311                 transform->matrix[0][1] = result.m[1][0];
1312                 transform->matrix[1][1] = result.m[1][1];
1313                 transform->matrix[2][1] = result.m[1][2];
1314                 transform->matrix[0][2] = result.m[2][0];
1315                 transform->matrix[1][2] = result.m[2][1];
1316                 transform->matrix[2][2] = result.m[2][2];
1317
1318         } else if (in->color_space == GRAY_SIGNATURE) {
1319                 transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
1320                 if (!transform->input_gamma_table_gray) {
1321                         qcms_transform_release(transform);
1322                         return NO_MEM_TRANSFORM;
1323                 }
1324
1325                 if (precache) {
1326                         if (out_type == QCMS_DATA_RGB_8) {
1327                                 transform->transform_fn = qcms_transform_data_gray_out_precache;
1328                         } else if (out_type == QCMS_DATA_RGBA_8) {
1329                                 if (in_type == QCMS_DATA_GRAY_8) {
1330                                         transform->transform_fn = qcms_transform_data_gray_rgba_out_precache;
1331                                 } else {
1332                                         transform->transform_fn = qcms_transform_data_graya_rgba_out_precache;
1333                                 }
1334                         } else if (out_type == QCMS_DATA_BGRA_8) {
1335                                 if (in_type == QCMS_DATA_GRAY_8) {
1336                                         transform->transform_fn = qcms_transform_data_gray_bgra_out_precache;
1337                                 } else {
1338                                         transform->transform_fn = qcms_transform_data_graya_bgra_out_precache;
1339                                 }
1340                         }
1341                 } else {
1342                         if (out_type == QCMS_DATA_RGB_8) {
1343                                 transform->transform_fn = qcms_transform_data_gray_out_lut;
1344                         } else if (out_type == QCMS_DATA_RGBA_8) {
1345                                 if (in_type == QCMS_DATA_GRAY_8) {
1346                                         transform->transform_fn = qcms_transform_data_gray_rgba_out_lut;
1347                                 } else {
1348                                         transform->transform_fn = qcms_transform_data_graya_rgba_out_lut;
1349                                 }
1350                         } else if (out_type == QCMS_DATA_BGRA_8) {
1351                                 if (in_type == QCMS_DATA_GRAY_8) {
1352                                         transform->transform_fn = qcms_transform_data_gray_bgra_out_lut;
1353                                 } else {
1354                                         transform->transform_fn = qcms_transform_data_graya_bgra_out_lut;
1355                                 }
1356                         }
1357                 }
1358         } else {
1359                 assert(0 && "unexpected colorspace");
1360                 qcms_transform_release(transform);
1361                 return NULL;
1362         }
1363         assert(transform->transform_fn);
1364         return transform;
1365 }
1366
1367 #if defined(__GNUC__) && defined(__i386__)
1368 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
1369 __attribute__((__force_align_arg_pointer__))
1370 #endif
1371 void qcms_transform_data(qcms_transform *transform, const void *src, void *dest, size_t length)
1372 {
1373         transform->transform_fn(transform, (const unsigned char*)src, (unsigned char*)dest, length);
1374 }
1375
/* Flag recording that ICC v4 support was requested. It starts out false and
 * is switched on by qcms_enable_iccv4().
 * NOTE(review): presumably read by profile-handling code elsewhere -- confirm. */
bool qcms_supports_iccv4 = false;

/* Turn on the ICC v4 flag. Calling this more than once has no further effect. */
void qcms_enable_iccv4()
{
        qcms_supports_iccv4 = true;
}
1381
#if defined(__arm__) || defined(__aarch64__)
/* Flag recording that NEON code paths were requested. It only exists on ARM
 * builds, starts out false and is switched on by qcms_enable_neon(). */
bool qcms_supports_neon = false;

/* Turn on the NEON flag (ARM builds). */
void qcms_enable_neon()
{
        qcms_supports_neon = true;
}
#else
/* Non-ARM builds have no NEON flag; the entry point is kept so callers can
 * invoke it unconditionally, and it does nothing. */
void qcms_enable_neon()
{
}
#endif