[gecko.git] / gfx / qcms / transform.c
/* vim: set ts=8 sw=8 noexpandtab: */
// qcms
// Copyright (C) 2009 Mozilla Corporation
// Copyright (C) 1998-2007 Marti Maria
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <string.h> //memcpy
#include "qcmsint.h"
#include "chain.h"
#include "matrix.h"
#include "transform_util.h"

/* for MSVC, GCC, Intel, and Sun compilers */
#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
#define X86
#endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */

/**
 * AltiVec detection for PowerPC CPUs.
 * If we have a method of runtime detection, use it. Otherwise statically
 * choose the AltiVec path if the compiler was told to build with AltiVec
 * support.
 */
#if (defined(__POWERPC__) || defined(__powerpc__))
#if defined(__linux__)
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <elf.h>
#include <linux/auxvec.h>
#include <asm/cputable.h>
#include <link.h>

static inline qcms_bool have_altivec() {
	static int available = -1;
	int new_avail = 0;
	ElfW(auxv_t) auxv;
	ssize_t count;
	int fd, i;

	if (available != -1)
		return (available != 0 ? true : false);

	fd = open("/proc/self/auxv", O_RDONLY);
	if (fd < 0)
		goto out;
	do {
		count = read(fd, &auxv, sizeof(auxv));
		if (count < 0)
			goto out_close;

		if (auxv.a_type == AT_HWCAP) {
			new_avail = !!(auxv.a_un.a_val & PPC_FEATURE_HAS_ALTIVEC);
			goto out_close;
		}
	} while (auxv.a_type != AT_NULL);

out_close:
	close(fd);
out:
	available = new_avail;
	return (available != 0 ? true : false);
}
#elif defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>

/**
 * Adapted from ffmpeg's AltiVec detection code;
 * the same code also appears on Apple's AltiVec pages.
 */
static inline qcms_bool have_altivec() {
	int sels[2] = {CTL_HW, HW_VECTORUNIT};
	static int available = -1;
	size_t len = sizeof(available);
	int err;

	if (available != -1)
		return (available != 0 ? true : false);

	err = sysctl(sels, 2, &available, &len, NULL, 0);

	if (err == 0)
		if (available != 0)
			return true;

	return false;
}
#elif defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
#define have_altivec() true
#else
#define have_altivec() false
#endif
#endif // (defined(__POWERPC__) || defined(__powerpc__))

// Build a white point / primary chromaticities transfer matrix from RGB to CIE XYZ.
// This is just an approximation: I am not handling all the non-linear
// aspects of the RGB to XYZ process, and I am assuming that the gamma
// correction has a transitive property in the transformation chain.
//
// The algorithm:
//
//            - First I build the absolute conversion matrix using
//              primaries in XYZ. This matrix is then inverted.
//            - Then I evaluate the source white point across this matrix,
//              obtaining the coefficients of the transformation.
//            - Then I apply these coefficients to the original matrix.
static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
{
	struct matrix primaries;
	struct matrix primaries_invert;
	struct matrix result;
	struct vector white_point;
	struct vector coefs;

	double xn, yn;
	double xr, yr;
	double xg, yg;
	double xb, yb;

	xn = white.x;
	yn = white.y;

	if (yn == 0.0)
		return matrix_invalid();

	xr = primrs.red.x;
	yr = primrs.red.y;
	xg = primrs.green.x;
	yg = primrs.green.y;
	xb = primrs.blue.x;
	yb = primrs.blue.y;

	primaries.m[0][0] = xr;
	primaries.m[0][1] = xg;
	primaries.m[0][2] = xb;

	primaries.m[1][0] = yr;
	primaries.m[1][1] = yg;
	primaries.m[1][2] = yb;

	primaries.m[2][0] = 1 - xr - yr;
	primaries.m[2][1] = 1 - xg - yg;
	primaries.m[2][2] = 1 - xb - yb;
	primaries.invalid = false;

	white_point.v[0] = xn/yn;
	white_point.v[1] = 1.;
	white_point.v[2] = (1.0-xn-yn)/yn;

	primaries_invert = matrix_invert(primaries);

	coefs = matrix_eval(primaries_invert, white_point);

	result.m[0][0] = coefs.v[0]*xr;
	result.m[0][1] = coefs.v[1]*xg;
	result.m[0][2] = coefs.v[2]*xb;

	result.m[1][0] = coefs.v[0]*yr;
	result.m[1][1] = coefs.v[1]*yg;
	result.m[1][2] = coefs.v[2]*yb;

	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
	result.invalid = primaries_invert.invalid;

	return result;
}
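/* Illustrative example (values not taken from this file): for the sRGB
 * primaries (R 0.640,0.330; G 0.300,0.600; B 0.150,0.060) and a D65 white
 * point (0.3127, 0.3290), the matrix built here is approximately
 *
 *     0.4124  0.3576  0.1805
 *     0.2126  0.7152  0.0722
 *     0.0193  0.1192  0.9505
 *
 * i.e. the familiar sRGB-to-XYZ (D65) matrix, before any chromatic
 * adaptation to D50 is applied below. */
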
struct CIE_XYZ {
	double X;
	double Y;
	double Z;
};

/* CIE Illuminant D50 */
static const struct CIE_XYZ D50_XYZ = {
	0.9642,
	1.0000,
	0.8249
};

/* from lcms: xyY2XYZ()
 * corresponds to argyll: icmYxy2XYZ() */
static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
{
	struct CIE_XYZ dest;
	dest.X = (source.x / source.y) * source.Y;
	dest.Y = source.Y;
	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
	return dest;
}

/* from lcms: ComputeChromaticAdaption */
// Compute chromatic adaption matrix using chad as cone matrix
static struct matrix
compute_chromatic_adaption(struct CIE_XYZ source_white_point,
                           struct CIE_XYZ dest_white_point,
                           struct matrix chad)
{
	struct matrix chad_inv;
	struct vector cone_source_XYZ, cone_source_rgb;
	struct vector cone_dest_XYZ, cone_dest_rgb;
	struct matrix cone, tmp;

	tmp = chad;
	chad_inv = matrix_invert(tmp);

	cone_source_XYZ.v[0] = source_white_point.X;
	cone_source_XYZ.v[1] = source_white_point.Y;
	cone_source_XYZ.v[2] = source_white_point.Z;

	cone_dest_XYZ.v[0] = dest_white_point.X;
	cone_dest_XYZ.v[1] = dest_white_point.Y;
	cone_dest_XYZ.v[2] = dest_white_point.Z;

	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
	cone_dest_rgb = matrix_eval(chad, cone_dest_XYZ);

	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
	cone.m[0][1] = 0;
	cone.m[0][2] = 0;
	cone.m[1][0] = 0;
	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
	cone.m[1][2] = 0;
	cone.m[2][0] = 0;
	cone.m[2][1] = 0;
	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
	cone.invalid = false;

	// Normalize
	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
}
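/* This is a von Kries style adaptation: both white points are mapped into the
 * cone response space defined by 'chad', the per-channel ratios dest/source
 * form the diagonal 'cone' matrix, and chad^-1 * cone * chad brings the
 * scaling back, so colors that matched the source white now match the
 * destination white. */
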
/* from lcms: cmsAdaptionMatrix */
// Returns the final chromatic adaptation from illuminant FromIll to illuminant ToIll
// Bradford is assumed
static struct matrix
adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
{
	struct matrix lam_rigg = {{ // Bradford matrix
		{  0.8951,  0.2664, -0.1614 },
		{ -0.7502,  1.7135,  0.0367 },
		{  0.0389, -0.0685,  1.0296 }
	}};
	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
}

/* from lcms: cmsAdaptMatrixToD50 */
static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
{
	struct CIE_XYZ Dn;
	struct matrix Bradford;

	if (source_white_pt.y == 0.0)
		return matrix_invalid();

	Dn = xyY2XYZ(source_white_pt);

	Bradford = adaption_matrix(Dn, D50_XYZ);
	return matrix_multiply(Bradford, r);
}

qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
{
	struct matrix colorants;
	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
	colorants = adapt_matrix_to_D50(colorants, white_point);

	if (colorants.invalid)
		return false;

	/* note: there's a transpose type of operation going on here */
	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);

	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);

	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);

	return true;
}

qcms_bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
{
	*colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
	*colorants = adapt_matrix_to_D50(*colorants, white_point);

	return (colorants->invalid ? true : false);
}

#if 0
static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	int i;
	float (*mat)[4] = transform->matrix;
	for (i=0; i<length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
		float out_device_b = pow(out_linear_b, transform->out_gamma_b);

		dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
		dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
		dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
		dest += RGB_OUTPUT_COMPONENTS;
	}
}
#endif

static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	for (i = 0; i < length; i++) {
		float out_device_r, out_device_g, out_device_b;
		unsigned char device = *src++;

		float linear = transform->input_gamma_table_gray[device];

		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);

		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
		dest += RGB_OUTPUT_COMPONENTS;
	}
}

/* Alpha is not corrected.
   A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
   RGB Is?" Tech Memo 17 (December 14, 1998).
   See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
*/

static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	for (i = 0; i < length; i++) {
		float out_device_r, out_device_g, out_device_b;
		unsigned char device = *src++;
		unsigned char alpha = *src++;

		float linear = transform->input_gamma_table_gray[device];

		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);

		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
		dest[OUTPUT_A_INDEX] = alpha;
		dest += RGBA_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	for (i = 0; i < length; i++) {
		unsigned char device = *src++;
		uint16_t gray;

		float linear = transform->input_gamma_table_gray[device];

		/* we could round here... */
		gray = linear * PRECACHE_OUTPUT_MAX;

		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
		dest += RGB_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	for (i = 0; i < length; i++) {
		unsigned char device = *src++;
		unsigned char alpha = *src++;
		uint16_t gray;

		float linear = transform->input_gamma_table_gray[device];

		/* we could round here... */
		gray = linear * PRECACHE_OUTPUT_MAX;

		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
		dest[OUTPUT_A_INDEX] = alpha;
		dest += RGBA_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;
		uint16_t r, g, b;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		out_linear_r = clamp_float(out_linear_r);
		out_linear_g = clamp_float(out_linear_g);
		out_linear_b = clamp_float(out_linear_b);

		/* we could round here... */
		r = out_linear_r * PRECACHE_OUTPUT_MAX;
		g = out_linear_g * PRECACHE_OUTPUT_MAX;
		b = out_linear_b * PRECACHE_OUTPUT_MAX;

		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
		dest += RGB_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;
		unsigned char alpha = *src++;
		uint16_t r, g, b;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		out_linear_r = clamp_float(out_linear_r);
		out_linear_g = clamp_float(out_linear_g);
		out_linear_b = clamp_float(out_linear_b);

		/* we could round here... */
		r = out_linear_r * PRECACHE_OUTPUT_MAX;
		g = out_linear_g * PRECACHE_OUTPUT_MAX;
		b = out_linear_b * PRECACHE_OUTPUT_MAX;

		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
		dest[OUTPUT_A_INDEX] = alpha;
		dest += RGBA_OUTPUT_COMPONENTS;
	}
}

// Not used
/*
static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
	unsigned int i;
	int xy_len = 1;
	int x_len = transform->grid_size;
	int len = x_len * x_len;
	float* r_table = transform->r_clut;
	float* g_table = transform->g_clut;
	float* b_table = transform->b_clut;

	for (i = 0; i < length; i++) {
		unsigned char in_r = *src++;
		unsigned char in_g = *src++;
		unsigned char in_b = *src++;
		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;

		int x = floorf(linear_r * (transform->grid_size-1));
		int y = floorf(linear_g * (transform->grid_size-1));
		int z = floorf(linear_b * (transform->grid_size-1));
		int x_n = ceilf(linear_r * (transform->grid_size-1));
		int y_n = ceilf(linear_g * (transform->grid_size-1));
		int z_n = ceilf(linear_b * (transform->grid_size-1));
		float x_d = linear_r * (transform->grid_size-1) - x;
		float y_d = linear_g * (transform->grid_size-1) - y;
		float z_d = linear_b * (transform->grid_size-1) - z;

		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
		float r_y1 = lerp(r_x1, r_x2, y_d);
		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
		float r_y2 = lerp(r_x3, r_x4, y_d);
		float clut_r = lerp(r_y1, r_y2, z_d);

		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
		float g_y1 = lerp(g_x1, g_x2, y_d);
		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
		float g_y2 = lerp(g_x3, g_x4, y_d);
		float clut_g = lerp(g_y1, g_y2, z_d);

		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
		float b_y1 = lerp(b_x1, b_x2, y_d);
		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
		float b_y2 = lerp(b_x3, b_x4, y_d);
		float clut_b = lerp(b_y1, b_y2, z_d);

		*dest++ = clamp_u8(clut_r*255.0f);
		*dest++ = clamp_u8(clut_g*255.0f);
		*dest++ = clamp_u8(clut_b*255.0f);
	}
}
*/

/* Integer ceiling division for non-negative operands; used below to pick the
 * upper CLUT grid index that pairs with the truncating division used for the
 * lower index. */
static int int_div_ceil(int value, int div) {
	return ((value + div - 1) / div);
}

// Using lcms' tetra interpolation algorithm.
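// The fractional position (rx, ry, rz) inside the sampled grid cell is
// ordered, that ordering selects one of the six tetrahedra that partition the
// cell, and the per-channel deltas c1..c3 taken along that tetrahedron's
// edges give the result as c0 + c1*rx + c2*ry + c3*rz, so each channel needs
// values from only four lattice points instead of the eight a trilinear
// lookup would read.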
static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
	unsigned int i;
	int xy_len = 1;
	int x_len = transform->grid_size;
	int len = x_len * x_len;
	float* r_table = transform->r_clut;
	float* g_table = transform->g_clut;
	float* b_table = transform->b_clut;
	float c0_r, c1_r, c2_r, c3_r;
	float c0_g, c1_g, c2_g, c3_g;
	float c0_b, c1_b, c2_b, c3_b;
	float clut_r, clut_g, clut_b;
	for (i = 0; i < length; i++) {
		unsigned char in_r = *src++;
		unsigned char in_g = *src++;
		unsigned char in_b = *src++;
		unsigned char in_a = *src++;
		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;

		int x = in_r * (transform->grid_size-1) / 255;
		int y = in_g * (transform->grid_size-1) / 255;
		int z = in_b * (transform->grid_size-1) / 255;
		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
		float rx = linear_r * (transform->grid_size-1) - x;
		float ry = linear_g * (transform->grid_size-1) - y;
		float rz = linear_b * (transform->grid_size-1) - z;

		c0_r = CLU(r_table, x, y, z);
		c0_g = CLU(g_table, x, y, z);
		c0_b = CLU(b_table, x, y, z);

		if( rx >= ry ) {
			if (ry >= rz) { //rx >= ry && ry >= rz
				c1_r = CLU(r_table, x_n, y, z) - c0_r;
				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
				c1_g = CLU(g_table, x_n, y, z) - c0_g;
				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
				c1_b = CLU(b_table, x_n, y, z) - c0_b;
				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
			} else {
				if (rx >= rz) { //rx >= rz && rz >= ry
					c1_r = CLU(r_table, x_n, y, z) - c0_r;
					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
					c1_g = CLU(g_table, x_n, y, z) - c0_g;
					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
					c1_b = CLU(b_table, x_n, y, z) - c0_b;
					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
				} else { //rz > rx && rx >= ry
					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
					c3_r = CLU(r_table, x, y, z_n) - c0_r;
					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
					c3_g = CLU(g_table, x, y, z_n) - c0_g;
					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
					c3_b = CLU(b_table, x, y, z_n) - c0_b;
				}
			}
		} else {
			if (rx >= rz) { //ry > rx && rx >= rz
				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
				c2_r = CLU(r_table, x, y_n, z) - c0_r;
				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
				c2_g = CLU(g_table, x, y_n, z) - c0_g;
				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
				c2_b = CLU(b_table, x, y_n, z) - c0_b;
				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
			} else {
				if (ry >= rz) { //ry >= rz && rz > rx
					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
					c2_r = CLU(r_table, x, y_n, z) - c0_r;
					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
					c2_g = CLU(g_table, x, y_n, z) - c0_g;
					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
					c2_b = CLU(b_table, x, y_n, z) - c0_b;
					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
				} else { //rz > ry && ry > rx
					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
					c3_r = CLU(r_table, x, y, z_n) - c0_r;
					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
					c3_g = CLU(g_table, x, y, z_n) - c0_g;
					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
					c3_b = CLU(b_table, x, y, z_n) - c0_b;
				}
			}
		}

		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;

		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
		dest[OUTPUT_A_INDEX] = in_a;
		dest += RGBA_OUTPUT_COMPONENTS;
	}
}

// Using lcms' tetra interpolation code.
static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
	unsigned int i;
	int xy_len = 1;
	int x_len = transform->grid_size;
	int len = x_len * x_len;
	float* r_table = transform->r_clut;
	float* g_table = transform->g_clut;
	float* b_table = transform->b_clut;
	float c0_r, c1_r, c2_r, c3_r;
	float c0_g, c1_g, c2_g, c3_g;
	float c0_b, c1_b, c2_b, c3_b;
	float clut_r, clut_g, clut_b;
	for (i = 0; i < length; i++) {
		unsigned char in_r = *src++;
		unsigned char in_g = *src++;
		unsigned char in_b = *src++;
		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;

		int x = in_r * (transform->grid_size-1) / 255;
		int y = in_g * (transform->grid_size-1) / 255;
		int z = in_b * (transform->grid_size-1) / 255;
		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
		float rx = linear_r * (transform->grid_size-1) - x;
		float ry = linear_g * (transform->grid_size-1) - y;
		float rz = linear_b * (transform->grid_size-1) - z;

		c0_r = CLU(r_table, x, y, z);
		c0_g = CLU(g_table, x, y, z);
		c0_b = CLU(b_table, x, y, z);

		if( rx >= ry ) {
			if (ry >= rz) { //rx >= ry && ry >= rz
				c1_r = CLU(r_table, x_n, y, z) - c0_r;
				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
				c1_g = CLU(g_table, x_n, y, z) - c0_g;
				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
				c1_b = CLU(b_table, x_n, y, z) - c0_b;
				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
			} else {
				if (rx >= rz) { //rx >= rz && rz >= ry
					c1_r = CLU(r_table, x_n, y, z) - c0_r;
					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
					c1_g = CLU(g_table, x_n, y, z) - c0_g;
					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
					c1_b = CLU(b_table, x_n, y, z) - c0_b;
					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
				} else { //rz > rx && rx >= ry
					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
					c3_r = CLU(r_table, x, y, z_n) - c0_r;
					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
					c3_g = CLU(g_table, x, y, z_n) - c0_g;
					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
					c3_b = CLU(b_table, x, y, z_n) - c0_b;
				}
			}
		} else {
			if (rx >= rz) { //ry > rx && rx >= rz
				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
				c2_r = CLU(r_table, x, y_n, z) - c0_r;
				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
				c2_g = CLU(g_table, x, y_n, z) - c0_g;
				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
				c2_b = CLU(b_table, x, y_n, z) - c0_b;
				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
			} else {
				if (ry >= rz) { //ry >= rz && rz > rx
					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
					c2_r = CLU(r_table, x, y_n, z) - c0_r;
					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
					c2_g = CLU(g_table, x, y_n, z) - c0_g;
					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
					c2_b = CLU(b_table, x, y_n, z) - c0_b;
					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
				} else { //rz > ry && ry > rx
					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
					c3_r = CLU(r_table, x, y, z_n) - c0_r;
					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
					c3_g = CLU(g_table, x, y, z_n) - c0_g;
					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
					c3_b = CLU(b_table, x, y, z_n) - c0_b;
				}
			}
		}

		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;

		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
		dest += RGB_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;
		float out_device_r, out_device_g, out_device_b;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		out_linear_r = clamp_float(out_linear_r);
		out_linear_g = clamp_float(out_linear_g);
		out_linear_b = clamp_float(out_linear_b);

		out_device_r = lut_interp_linear(out_linear_r,
				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
		out_device_g = lut_interp_linear(out_linear_g,
				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
		out_device_b = lut_interp_linear(out_linear_b,
				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);

		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
		dest += RGB_OUTPUT_COMPONENTS;
	}
}

static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;
		unsigned char alpha = *src++;
		float out_device_r, out_device_g, out_device_b;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		out_linear_r = clamp_float(out_linear_r);
		out_linear_g = clamp_float(out_linear_g);
		out_linear_b = clamp_float(out_linear_b);

		out_device_r = lut_interp_linear(out_linear_r,
				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
		out_device_g = lut_interp_linear(out_linear_g,
				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
		out_device_b = lut_interp_linear(out_linear_b,
				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);

		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
		dest[OUTPUT_A_INDEX] = alpha;
		dest += RGBA_OUTPUT_COMPONENTS;
	}
}

#if 0
static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		*dest++ = clamp_u8(out_linear_r*255);
		*dest++ = clamp_u8(out_linear_g*255);
		*dest++ = clamp_u8(out_linear_b*255);
	}
}
#endif

/* If users create and destroy objects on different threads, even if the same
 * objects aren't used on different threads at the same time, we can still run
 * into trouble with refcounts if they aren't atomic.
 *
 * This can lead to us prematurely deleting the precache if threads get unlucky
 * and write the wrong value to the ref count.
 */
static struct precache_output *precache_reference(struct precache_output *p)
{
	qcms_atomic_increment(p->ref_count);
	return p;
}

static struct precache_output *precache_create()
{
	struct precache_output *p = malloc(sizeof(struct precache_output));
	if (p)
		p->ref_count = 1;
	return p;
}

void precache_release(struct precache_output *p)
{
	if (qcms_atomic_decrement(p->ref_count) == 0) {
		free(p);
	}
}

#ifdef HAVE_POSIX_MEMALIGN
static qcms_transform *transform_alloc(void)
{
	qcms_transform *t;

	void *allocated_memory;
	if (!posix_memalign(&allocated_memory, 16, sizeof(qcms_transform))) {
		/* Do a memset to initialise all bits to zero */
		memset(allocated_memory, 0, sizeof(qcms_transform));
		t = allocated_memory;
		return t;
	} else {
		return NULL;
	}
}
static void transform_free(qcms_transform *t)
{
	free(t);
}
#else
static qcms_transform *transform_alloc(void)
{
	/* transform needs to be aligned on a 16-byte boundary */
	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
	/* make room for a pointer to the block returned by calloc */
	void *transform_start = original_block + sizeof(void*);
	/* align transform_start */
	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);

	/* store a pointer to the block returned by calloc so that we can free it later */
	void **(original_block_ptr) = (void**)transform_aligned;
	if (!original_block)
		return NULL;
	original_block_ptr--;
	*original_block_ptr = original_block;

	return transform_aligned;
}
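/* Layout of the manually aligned allocation above:
 *
 *   original_block                            transform_aligned
 *   |                                         |
 *   v                                         v
 *   [ calloc'd block ... padding ... void* ][ qcms_transform, 16-byte aligned ]
 *                                    ^
 *                                    original_block is stored in the pointer
 *                                    slot just before the aligned transform,
 *                                    and read back by transform_free() below.
 */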
static void transform_free(qcms_transform *t)
{
	/* get at the pointer to the unaligned block returned by calloc */
	void **p = (void**)t;
	p--;
	free(*p);
}
#endif

void qcms_transform_release(qcms_transform *t)
{
	/* ensure we only free the gamma tables once even if there are
	 * multiple references to the same data */

	if (t->output_table_r)
		precache_release(t->output_table_r);
	if (t->output_table_g)
		precache_release(t->output_table_g);
	if (t->output_table_b)
		precache_release(t->output_table_b);

	free(t->input_gamma_table_r);
	if (t->input_gamma_table_g != t->input_gamma_table_r)
		free(t->input_gamma_table_g);
	if (t->input_gamma_table_g != t->input_gamma_table_r &&
	    t->input_gamma_table_g != t->input_gamma_table_b)
		free(t->input_gamma_table_b);

	free(t->input_gamma_table_gray);

	free(t->output_gamma_lut_r);
	free(t->output_gamma_lut_g);
	free(t->output_gamma_lut_b);

	transform_free(t);
}

#ifdef X86
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
// mozilla/jpeg)
// -------------------------------------------------------------------------
#if defined(_M_IX86) && defined(_MSC_VER)
#define HAS_CPUID
/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
   register - I'm not sure if that ever happens on Windows, but cpuid isn't
   on the critical path, so we just preserve the register to be safe and to be
   consistent with the non-Windows version. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
	uint32_t a_, b_, c_, d_;
	__asm {
		xchg ebx, esi
		mov eax, fxn
		cpuid
		mov a_, eax
		mov b_, ebx
		mov c_, ecx
		mov d_, edx
		xchg ebx, esi
	}
	*a = a_;
	*b = b_;
	*c = c_;
	*d = d_;
}
#elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
#define HAS_CPUID
/* Get us a CPUID function. We can't use ebx because it's the PIC register on
   some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {

	uint32_t a_, b_, c_, d_;
	__asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
	                      : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
	*a = a_;
	*b = b_;
	*c = c_;
	*d = d_;
}
#endif

// -------------------------Runtime SSEx Detection-----------------------------

/* MMX is always supported per
 * Gecko v1.9.1 minimum CPU requirements */
#define SSE1_EDX_MASK (1UL << 25)
#define SSE2_EDX_MASK (1UL << 26)
#define SSE3_ECX_MASK (1UL <<  0)

static int sse_version_available(void)
{
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
	/* we know at build time that 64-bit CPUs always have SSE2;
	 * this tells the compiler that non-SSE2 branches will never be
	 * taken (i.e. it is OK to optimize away the SSE1 and non-SIMD code) */
	return 2;
#elif defined(HAS_CPUID)
	static int sse_version = -1;
	uint32_t a, b, c, d;
	uint32_t function = 0x00000001;

	if (sse_version == -1) {
		sse_version = 0;
		cpuid(function, &a, &b, &c, &d);
		if (c & SSE3_ECX_MASK)
			sse_version = 3;
		else if (d & SSE2_EDX_MASK)
			sse_version = 2;
		else if (d & SSE1_EDX_MASK)
			sse_version = 1;
	}

	return sse_version;
#else
	return 0;
#endif
}
#endif
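/* Note: on x86-64 builds the helper above constant-folds to 2 (SSE2 is
 * architecturally guaranteed); on 32-bit builds with CPUID it probes once,
 * caches the result in a function-local static, and reports 3, 2, 1 or 0
 * for SSE3, SSE2, SSE1 or no SSE respectively. */
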
static const struct matrix bradford_matrix = {{ { 0.8951f, 0.2664f,-0.1614f},
						{-0.7502f, 1.7135f, 0.0367f},
						{ 0.0389f,-0.0685f, 1.0296f}},
						false};

static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
						     { 0.4323053f, 0.5183603f, 0.0492912f},
						     {-0.0085287f, 0.0400428f, 0.9684867f}},
						     false};

// See ICCv4 E.3
struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
	          (X*bradford_matrix.m[0][0] + Y*bradford_matrix.m[1][0] + Z*bradford_matrix.m[2][0]);
	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
	          (X*bradford_matrix.m[0][1] + Y*bradford_matrix.m[1][1] + Z*bradford_matrix.m[2][1]);
	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
	          (X*bradford_matrix.m[0][2] + Y*bradford_matrix.m[1][2] + Z*bradford_matrix.m[2][2]);
	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
}
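/* This scales the given white point (X, Y, Z) onto the D50 white point
 * (0.96422, 1.0, 0.82521) with a per-channel, von Kries style scaling in
 * Bradford space and then maps the result back out of Bradford space; it is
 * the same construction as compute_chromatic_adaption() above with D50 fixed
 * as the destination. */
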
void qcms_profile_precache_output_transform(qcms_profile *profile)
{
	/* we only support precaching on rgb profiles */
	if (profile->color_space != RGB_SIGNATURE)
		return;

	if (qcms_supports_iccv4) {
		/* don't precache since we will use the B2A LUT */
		if (profile->B2A0)
			return;

		/* don't precache since we will use the mBA LUT */
		if (profile->mBA)
			return;
	}

	/* don't precache if we do not have the TRC curves */
	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
		return;

	if (!profile->output_table_r) {
		profile->output_table_r = precache_create();
		if (profile->output_table_r &&
		    !compute_precache(profile->redTRC, profile->output_table_r->data)) {
			precache_release(profile->output_table_r);
			profile->output_table_r = NULL;
		}
	}
	if (!profile->output_table_g) {
		profile->output_table_g = precache_create();
		if (profile->output_table_g &&
		    !compute_precache(profile->greenTRC, profile->output_table_g->data)) {
			precache_release(profile->output_table_g);
			profile->output_table_g = NULL;
		}
	}
	if (!profile->output_table_b) {
		profile->output_table_b = precache_create();
		if (profile->output_table_b &&
		    !compute_precache(profile->blueTRC, profile->output_table_b->data)) {
			precache_release(profile->output_table_b);
			profile->output_table_b = NULL;
		}
	}
}

/* Replace the current transformation with a LUT transformation using a given number of sample points */
qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
		int samples, qcms_data_type in_type)
{
	/* The range between which 2 consecutive sample points can be used to interpolate */
	uint16_t x,y,z;
	uint32_t l;
	uint32_t lutSize = 3 * samples * samples * samples;
	float* src = NULL;
	float* dest = NULL;
	float* lut = NULL;

	src = malloc(lutSize*sizeof(float));
	dest = malloc(lutSize*sizeof(float));

	if (src && dest) {
		/* Prepare a list of points we want to sample */
		l = 0;
		for (x = 0; x < samples; x++) {
			for (y = 0; y < samples; y++) {
				for (z = 0; z < samples; z++) {
					src[l++] = x / (float)(samples-1);
					src[l++] = y / (float)(samples-1);
					src[l++] = z / (float)(samples-1);
				}
			}
		}

		lut = qcms_chain_transform(in, out, src, dest, lutSize);
		if (lut) {
			transform->r_clut = &lut[0];
			transform->g_clut = &lut[1];
			transform->b_clut = &lut[2];
			transform->grid_size = samples;
			if (in_type == QCMS_DATA_RGBA_8) {
				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
			} else {
				transform->transform_fn = qcms_transform_data_tetra_clut;
			}
		}
	}

	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be freed.
	if (src && lut != src) {
		free(src);
	}
	if (dest && lut != dest) {
		free(dest);
	}

	if (lut == NULL) {
		return NULL;
	}
	return transform;
}
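/* Layout note: the returned lut holds samples^3 grid points as interleaved
 * R,G,B floats, so r_clut, g_clut and b_clut are the same buffer offset by
 * 0, 1 and 2; the per-channel stride is handled inside the CLU() lookups
 * used by the tetra routines above. */
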
#define NO_MEM_TRANSFORM NULL

qcms_transform* qcms_transform_create(
		qcms_profile *in, qcms_data_type in_type,
		qcms_profile *out, qcms_data_type out_type,
		qcms_intent intent)
{
	bool precache = false;

	qcms_transform *transform = transform_alloc();
	if (!transform) {
		return NULL;
	}
	if (out_type != QCMS_DATA_RGB_8 &&
	    out_type != QCMS_DATA_RGBA_8) {
		assert(0 && "output type");
		transform_free(transform);
		return NULL;
	}

	if (out->output_table_r &&
	    out->output_table_g &&
	    out->output_table_b) {
		precache = true;
	}

	// This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
	if (qcms_supports_iccv4 &&
	    (in_type == QCMS_DATA_RGB_8 || in_type == QCMS_DATA_RGBA_8) &&
	    (in->A2B0 || out->B2A0 || in->mAB || out->mAB))
	{
		// Precache the transformation to a CLUT 33x33x33 in size.
		// 33 is used by many profiles and works well in practice.
		// This evenly divides 256 into blocks of 8x8x8.
		// TODO: For transforming small data sets of about 200x200 or less,
		// precaching should be avoided.
		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
		if (!result) {
			assert(0 && "precacheLUT failed");
			transform_free(transform);
			return NULL;
		}
		return result;
	}

	if (precache) {
		transform->output_table_r = precache_reference(out->output_table_r);
		transform->output_table_g = precache_reference(out->output_table_g);
		transform->output_table_b = precache_reference(out->output_table_b);
	} else {
		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
			qcms_transform_release(transform);
			return NO_MEM_TRANSFORM;
		}
		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
			qcms_transform_release(transform);
			return NO_MEM_TRANSFORM;
		}
	}

	if (in->color_space == RGB_SIGNATURE) {
		struct matrix in_matrix, out_matrix, result;

		if (in_type != QCMS_DATA_RGB_8 &&
		    in_type != QCMS_DATA_RGBA_8) {
			assert(0 && "input type");
			transform_free(transform);
			return NULL;
		}
		if (precache) {
#ifdef X86
			if (sse_version_available() >= 2) {
				if (in_type == QCMS_DATA_RGB_8)
					transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
				else
					transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;

#if !(defined(_MSC_VER) && defined(_M_AMD64))
			/* The Microsoft compiler for x64 doesn't support MMX.
			 * The SSE1 code uses MMX, so we disable it on x64. */
			} else
			if (sse_version_available() >= 1) {
				if (in_type == QCMS_DATA_RGB_8)
					transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
				else
					transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
#endif
			} else
#endif
#if (defined(__POWERPC__) || defined(__powerpc__))
			if (have_altivec()) {
				if (in_type == QCMS_DATA_RGB_8)
					transform->transform_fn = qcms_transform_data_rgb_out_lut_altivec;
				else
					transform->transform_fn = qcms_transform_data_rgba_out_lut_altivec;
			} else
#endif
			{
				if (in_type == QCMS_DATA_RGB_8)
					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
				else
					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
			}
		} else {
			if (in_type == QCMS_DATA_RGB_8)
				transform->transform_fn = qcms_transform_data_rgb_out_lut;
			else
				transform->transform_fn = qcms_transform_data_rgba_out_lut;
		}

		//XXX: avoid duplicating tables if we can
		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
			qcms_transform_release(transform);
			return NO_MEM_TRANSFORM;
		}

		/* build combined colorant matrix */
		in_matrix = build_colorant_matrix(in);
		out_matrix = build_colorant_matrix(out);
		out_matrix = matrix_invert(out_matrix);
		if (out_matrix.invalid) {
			qcms_transform_release(transform);
			return NULL;
		}
		result = matrix_multiply(out_matrix, in_matrix);

		/* store the results in column major mode
		 * this makes doing the multiplication with sse easier */
		transform->matrix[0][0] = result.m[0][0];
		transform->matrix[1][0] = result.m[0][1];
		transform->matrix[2][0] = result.m[0][2];
		transform->matrix[0][1] = result.m[1][0];
		transform->matrix[1][1] = result.m[1][1];
		transform->matrix[2][1] = result.m[1][2];
		transform->matrix[0][2] = result.m[2][0];
		transform->matrix[1][2] = result.m[2][1];
		transform->matrix[2][2] = result.m[2][2];

	} else if (in->color_space == GRAY_SIGNATURE) {
		if (in_type != QCMS_DATA_GRAY_8 &&
		    in_type != QCMS_DATA_GRAYA_8) {
			assert(0 && "input type");
			transform_free(transform);
			return NULL;
		}

		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
		if (!transform->input_gamma_table_gray) {
			qcms_transform_release(transform);
			return NO_MEM_TRANSFORM;
		}

		if (precache) {
			if (in_type == QCMS_DATA_GRAY_8) {
				transform->transform_fn = qcms_transform_data_gray_out_precache;
			} else {
				transform->transform_fn = qcms_transform_data_graya_out_precache;
			}
		} else {
			if (in_type == QCMS_DATA_GRAY_8) {
				transform->transform_fn = qcms_transform_data_gray_out_lut;
			} else {
				transform->transform_fn = qcms_transform_data_graya_out_lut;
			}
		}
	} else {
		assert(0 && "unexpected colorspace");
		transform_free(transform);
		return NULL;
	}
	return transform;
}
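/* Typical usage of the API above (illustrative sketch; the intent constant
 * and data-type enums come from qcms.h, the variable names are placeholders):
 *
 *   qcms_transform *t = qcms_transform_create(in_profile, QCMS_DATA_RGBA_8,
 *                                             out_profile, QCMS_DATA_RGBA_8,
 *                                             QCMS_INTENT_PERCEPTUAL);
 *   if (t) {
 *           qcms_transform_data(t, src_pixels, dst_pixels, pixel_count);
 *           qcms_transform_release(t);
 *   }
 */
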
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* we need this to avoid crashes when gcc assumes the stack is 128-bit aligned */
__attribute__((__force_align_arg_pointer__))
#endif
void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
{
	transform->transform_fn(transform, src, dest, length);
}

qcms_bool qcms_supports_iccv4;
void qcms_enable_iccv4()
{
	qcms_supports_iccv4 = true;
}