gfx/qcms/transform.c

   1 /* vim: set ts=8 sw=8 noexpandtab: */
   2 //  qcms
   3 //  Copyright (C) 2009 Mozilla Corporation
   4 //  Copyright (C) 1998-2007 Marti Maria
   5 //
   6 // Permission is hereby granted, free of charge, to any person obtaining
   7 // a copy of this software and associated documentation files (the "Software"),
   8 // to deal in the Software without restriction, including without limitation
   9 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10 // and/or sell copies of the Software, and to permit persons to whom the Software
  11 // is furnished to do so, subject to the following conditions:
  12 //
  13 // The above copyright notice and this permission notice shall be included in
  14 // all copies or substantial portions of the Software.
  15 //
  16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23
  24 #include <stdlib.h>
  25 #include <math.h>
  26 #include <assert.h>
  27 #include <string.h> //memcpy
  28 #include "qcmsint.h"
  29 #include "chain.h"
  30 #include "matrix.h"
  31 #include "transform_util.h"
  32
  33 /* for MSVC, GCC, Intel, and Sun compilers */
  34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
  35 #define X86
  36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
  37
/**
 * AltiVec detection for PowerPC CPUs.
 * Where a method of runtime detection is available, use it.
 * Otherwise statically choose the AltiVec path if the compiler
 * was told to build with AltiVec support.
 */
  44 #if (defined(__POWERPC__) || defined(__powerpc__))
  45 #if defined(__linux__)
  46 #include <unistd.h>
  47 #include <fcntl.h>
  48 #include <stdio.h>
  49 #include <elf.h>
  50 #include <linux/auxvec.h>
  51 #include <asm/cputable.h>
  52 #include <link.h>
  53
  54 static inline qcms_bool have_altivec() {
  55         static int available = -1;
  56         int new_avail = 0;
  57         ElfW(auxv_t) auxv;
  58         ssize_t count;
  59         int fd, i;
  60
  61         if (available != -1)
  62                 return (available != 0 ? true : false);
  63
  64         fd = open("/proc/self/auxv", O_RDONLY);
  65         if (fd < 0)
  66                 goto out;
  67         do {
  68                 count = read(fd, &auxv, sizeof(auxv));
  69                 if (count < 0)
  70                         goto out_close;
  71
  72                 if (auxv.a_type == AT_HWCAP) {
  73                         new_avail = !!(auxv.a_un.a_val & PPC_FEATURE_HAS_ALTIVEC);
  74                         goto out_close;
  75                 }
  76         } while (auxv.a_type != AT_NULL);
  77
  78 out_close:
  79         close(fd);
  80 out:
  81         available = new_avail;
  82         return (available != 0 ? true : false);
  83 }
  84 #elif defined(__APPLE__) && defined(__MACH__)
  85 #include <sys/sysctl.h>
  86
  87 /**
  88  * rip-off from ffmpeg AltiVec detection code.
  89  * this code also appears on Apple's AltiVec pages.
  90  */
  91 static inline qcms_bool have_altivec() {
  92         int sels[2] = {CTL_HW, HW_VECTORUNIT};
  93         static int available = -1;
  94         size_t len = sizeof(available);
  95         int err;
  96
  97         if (available != -1)
  98                 return (available != 0 ? true : false);
  99
 100         err = sysctl(sels, 2, &available, &len, NULL, 0);
 101
 102         if (err == 0)
 103                 if (available != 0)
 104                         return true;
 105
 106         return false;
 107 }
 108 #elif defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
 109 #define have_altivec() true
 110 #else
 111 #define have_altivec() false
 112 #endif
 113 #endif // (defined(__POWERPC__) || defined(__powerpc__))
 114
 115 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
 116 // This is just an approximation, I am not handling all the non-linear
 117 // aspects of the RGB to XYZ process, and assumming that the gamma correction
 118 // has transitive property in the tranformation chain.
 119 //
 120 // the alghoritm:
 121 //
 122 //            - First I build the absolute conversion matrix using
 123 //              primaries in XYZ. This matrix is next inverted
 124 //            - Then I eval the source white point across this matrix
 125 //              obtaining the coeficients of the transformation
 126 //            - Then, I apply these coeficients to the original matrix
 127 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
 128 {
 129         struct matrix primaries;
 130         struct matrix primaries_invert;
 131         struct matrix result;
 132         struct vector white_point;
 133         struct vector coefs;
 134
 135         double xn, yn;
 136         double xr, yr;
 137         double xg, yg;
 138         double xb, yb;
 139
 140         xn = white.x;
 141         yn = white.y;
 142
 143         if (yn == 0.0)
 144                 return matrix_invalid();
 145
 146         xr = primrs.red.x;
 147         yr = primrs.red.y;
 148         xg = primrs.green.x;
 149         yg = primrs.green.y;
 150         xb = primrs.blue.x;
 151         yb = primrs.blue.y;
 152
 153         primaries.m[0][0] = xr;
 154         primaries.m[0][1] = xg;
 155         primaries.m[0][2] = xb;
 156
 157         primaries.m[1][0] = yr;
 158         primaries.m[1][1] = yg;
 159         primaries.m[1][2] = yb;
 160
 161         primaries.m[2][0] = 1 - xr - yr;
 162         primaries.m[2][1] = 1 - xg - yg;
 163         primaries.m[2][2] = 1 - xb - yb;
 164         primaries.invalid = false;
 165
 166         white_point.v[0] = xn/yn;
 167         white_point.v[1] = 1.;
 168         white_point.v[2] = (1.0-xn-yn)/yn;
 169
 170         primaries_invert = matrix_invert(primaries);
 171         if (primaries_invert.invalid) {
 172                 return matrix_invalid();
 173         }
 174
 175         coefs = matrix_eval(primaries_invert, white_point);
 176
 177         result.m[0][0] = coefs.v[0]*xr;
 178         result.m[0][1] = coefs.v[1]*xg;
 179         result.m[0][2] = coefs.v[2]*xb;
 180
 181         result.m[1][0] = coefs.v[0]*yr;
 182         result.m[1][1] = coefs.v[1]*yg;
 183         result.m[1][2] = coefs.v[2]*yb;
 184
 185         result.m[2][0] = coefs.v[0]*(1.-xr-yr);
 186         result.m[2][1] = coefs.v[1]*(1.-xg-yg);
 187         result.m[2][2] = coefs.v[2]*(1.-xb-yb);
 188         result.invalid = primaries_invert.invalid;
 189
 190         return result;
 191 }
 192
 193 struct CIE_XYZ {
 194         double X;
 195         double Y;
 196         double Z;
 197 };
 198
 199 /* CIE Illuminant D50 */
 200 static const struct CIE_XYZ D50_XYZ = {
 201         0.9642,
 202         1.0000,
 203         0.8249
 204 };
 205
 206 /* from lcms: xyY2XYZ()
 207  * corresponds to argyll: icmYxy2XYZ() */
 208 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
 209 {
 210         struct CIE_XYZ dest;
 211         dest.X = (source.x / source.y) * source.Y;
 212         dest.Y = source.Y;
 213         dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
 214         return dest;
 215 }
 216
 217 /* from lcms: ComputeChromaticAdaption */
 218 // Compute chromatic adaption matrix using chad as cone matrix
 219 static struct matrix
 220 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
 221                            struct CIE_XYZ dest_white_point,
 222                            struct matrix chad)
 223 {
 224         struct matrix chad_inv;
 225         struct vector cone_source_XYZ, cone_source_rgb;
 226         struct vector cone_dest_XYZ, cone_dest_rgb;
 227         struct matrix cone, tmp;
 228
 229         tmp = chad;
 230         chad_inv = matrix_invert(tmp);
 231         if (chad_inv.invalid) {
 232                 return matrix_invalid();
 233         }
 234
 235         cone_source_XYZ.v[0] = source_white_point.X;
 236         cone_source_XYZ.v[1] = source_white_point.Y;
 237         cone_source_XYZ.v[2] = source_white_point.Z;
 238
 239         cone_dest_XYZ.v[0] = dest_white_point.X;
 240         cone_dest_XYZ.v[1] = dest_white_point.Y;
 241         cone_dest_XYZ.v[2] = dest_white_point.Z;
 242
 243         cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
 244         cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
 245
 246         cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
 247         cone.m[0][1] = 0;
 248         cone.m[0][2] = 0;
 249         cone.m[1][0] = 0;
 250         cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
 251         cone.m[1][2] = 0;
 252         cone.m[2][0] = 0;
 253         cone.m[2][1] = 0;
 254         cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
 255         cone.invalid = false;
 256
 257         // Normalize
 258         return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
 259 }
 260
 261 /* from lcms: cmsAdaptionMatrix */
 262 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
 263 // Bradford is assumed
 264 static struct matrix
 265 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
 266 {
 267         struct matrix lam_rigg = {{ // Bradford matrix
 268                                  {  0.8951f,  0.2664f, -0.1614f },
 269                                  { -0.7502f,  1.7135f,  0.0367f },
 270                                  {  0.0389f, -0.0685f,  1.0296f }
 271                                  }};
 272         return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
 273 }
 274
 275 /* from lcms: cmsAdaptMatrixToD50 */
 276 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
 277 {
 278         struct CIE_XYZ Dn;
 279         struct matrix Bradford;
 280
 281         if (source_white_pt.y == 0.0) {
 282                 return matrix_invalid();
 283         }
 284
 285         Dn = xyY2XYZ(source_white_pt);
 286
 287         Bradford = adaption_matrix(Dn, D50_XYZ);
 288         if (Bradford.invalid) {
 289                 return matrix_invalid();
 290         }
 291         return matrix_multiply(Bradford, r);
 292 }
 293
 294 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
 295 {
 296         struct matrix colorants;
 297         colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 298         colorants = adapt_matrix_to_D50(colorants, white_point);
 299
 300         if (colorants.invalid)
 301                 return false;
 302
 303         /* note: there's a transpose type of operation going on here */
 304         profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
 305         profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
 306         profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
 307
 308         profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
 309         profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
 310         profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
 311
 312         profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
 313         profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
 314         profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
 315
 316         return true;
 317 }
 318
 319 qcms_bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
 320 {
 321         *colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 322         *colorants = adapt_matrix_to_D50(*colorants, white_point);
 323
 324         return (colorants->invalid ? true : false);
 325 }
 326
 327 #if 0
 328 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 329 {
 330         int i;
 331         float (*mat)[4] = transform->matrix;
 332         for (i=0; i<length; i++) {
 333                 unsigned char device_r = *src++;
 334                 unsigned char device_g = *src++;
 335                 unsigned char device_b = *src++;
 336
 337                 float linear_r = transform->input_gamma_table_r[device_r];
 338                 float linear_g = transform->input_gamma_table_g[device_g];
 339                 float linear_b = transform->input_gamma_table_b[device_b];
 340
 341                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 342                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 343                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 344
 345                 float out_device_r = pow(out_linear_r, transform->out_gamma_r);
 346                 float out_device_g = pow(out_linear_g, transform->out_gamma_g);
 347                 float out_device_b = pow(out_linear_b, transform->out_gamma_b);
 348
 349                 dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
 350                 dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
 351                 dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
 352                 dest += RGB_OUTPUT_COMPONENTS;
 353         }
 354 }
 355 #endif
 356
 357 static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 358 {
 359         unsigned int i;
 360         for (i = 0; i < length; i++) {
 361                 float out_device_r, out_device_g, out_device_b;
 362                 unsigned char device = *src++;
 363
 364                 float linear = transform->input_gamma_table_gray[device];
 365
 366                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 367                 out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 368                 out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 369
 370                 dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
 371                 dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
 372                 dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
 373                 dest += RGB_OUTPUT_COMPONENTS;
 374         }
 375 }
 376
/* Alpha is not corrected.
   A rationale for this is found in Alvy Ray Smith's "Should Alpha Be Nonlinear If
   RGB Is?" Tech Memo 17 (December 14, 1998).
        See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
*/
 382
 383 static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 384 {
 385         unsigned int i;
 386         for (i = 0; i < length; i++) {
 387                 float out_device_r, out_device_g, out_device_b;
 388                 unsigned char device = *src++;
 389                 unsigned char alpha = *src++;
 390
 391                 float linear = transform->input_gamma_table_gray[device];
 392
 393                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 394                 out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 395                 out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 396
 397                 dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
 398                 dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
 399                 dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
 400                 dest[OUTPUT_A_INDEX] = alpha;
 401                 dest += RGBA_OUTPUT_COMPONENTS;
 402         }
 403 }
 404
 405
 406 static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 407 {
 408         unsigned int i;
 409         for (i = 0; i < length; i++) {
 410                 unsigned char device = *src++;
 411                 uint16_t gray;
 412
 413                 float linear = transform->input_gamma_table_gray[device];
 414
 415                 /* we could round here... */
 416                 gray = linear * PRECACHE_OUTPUT_MAX;
 417
 418                 dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
 419                 dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
 420                 dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
 421                 dest += RGB_OUTPUT_COMPONENTS;
 422         }
 423 }
 424
 425 static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 426 {
 427         unsigned int i;
 428         for (i = 0; i < length; i++) {
 429                 unsigned char device = *src++;
 430                 unsigned char alpha = *src++;
 431                 uint16_t gray;
 432
 433                 float linear = transform->input_gamma_table_gray[device];
 434
 435                 /* we could round here... */
 436                 gray = linear * PRECACHE_OUTPUT_MAX;
 437
 438                 dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
 439                 dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
 440                 dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
 441                 dest[OUTPUT_A_INDEX] = alpha;
 442                 dest += RGBA_OUTPUT_COMPONENTS;
 443         }
 444 }
 445
 446 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 447 {
 448         unsigned int i;
 449         float (*mat)[4] = transform->matrix;
 450         for (i = 0; i < length; i++) {
 451                 unsigned char device_r = *src++;
 452                 unsigned char device_g = *src++;
 453                 unsigned char device_b = *src++;
 454                 uint16_t r, g, b;
 455
 456                 float linear_r = transform->input_gamma_table_r[device_r];
 457                 float linear_g = transform->input_gamma_table_g[device_g];
 458                 float linear_b = transform->input_gamma_table_b[device_b];
 459
 460                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 461                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 462                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 463
 464                 out_linear_r = clamp_float(out_linear_r);
 465                 out_linear_g = clamp_float(out_linear_g);
 466                 out_linear_b = clamp_float(out_linear_b);
 467
 468                 /* we could round here... */
 469                 r = out_linear_r * PRECACHE_OUTPUT_MAX;
 470                 g = out_linear_g * PRECACHE_OUTPUT_MAX;
 471                 b = out_linear_b * PRECACHE_OUTPUT_MAX;
 472
 473                 dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
 474                 dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
 475                 dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
 476                 dest += RGB_OUTPUT_COMPONENTS;
 477         }
 478 }
 479
 480 static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 481 {
 482         unsigned int i;
 483         float (*mat)[4] = transform->matrix;
 484         for (i = 0; i < length; i++) {
 485                 unsigned char device_r = *src++;
 486                 unsigned char device_g = *src++;
 487                 unsigned char device_b = *src++;
 488                 unsigned char alpha = *src++;
 489                 uint16_t r, g, b;
 490
 491                 float linear_r = transform->input_gamma_table_r[device_r];
 492                 float linear_g = transform->input_gamma_table_g[device_g];
 493                 float linear_b = transform->input_gamma_table_b[device_b];
 494
 495                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 496                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 497                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 498
 499                 out_linear_r = clamp_float(out_linear_r);
 500                 out_linear_g = clamp_float(out_linear_g);
 501                 out_linear_b = clamp_float(out_linear_b);
 502
 503                 /* we could round here... */
 504                 r = out_linear_r * PRECACHE_OUTPUT_MAX;
 505                 g = out_linear_g * PRECACHE_OUTPUT_MAX;
 506                 b = out_linear_b * PRECACHE_OUTPUT_MAX;
 507
 508                 dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
 509                 dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
 510                 dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
 511                 dest[OUTPUT_A_INDEX] = alpha;
 512                 dest += RGBA_OUTPUT_COMPONENTS;
 513         }
 514 }
 515
 516 // Not used
 517 /*
 518 static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
 519         unsigned int i;
 520         int xy_len = 1;
 521         int x_len = transform->grid_size;
 522         int len = x_len * x_len;
 523         float* r_table = transform->r_clut;
 524         float* g_table = transform->g_clut;
 525         float* b_table = transform->b_clut;
 526
 527         for (i = 0; i < length; i++) {
 528                 unsigned char in_r = *src++;
 529                 unsigned char in_g = *src++;
 530                 unsigned char in_b = *src++;
 531                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 532
 533                 int x = floorf(linear_r * (transform->grid_size-1));
 534                 int y = floorf(linear_g * (transform->grid_size-1));
 535                 int z = floorf(linear_b * (transform->grid_size-1));
 536                 int x_n = ceilf(linear_r * (transform->grid_size-1));
 537                 int y_n = ceilf(linear_g * (transform->grid_size-1));
 538                 int z_n = ceilf(linear_b * (transform->grid_size-1));
 539                 float x_d = linear_r * (transform->grid_size-1) - x;
 540                 float y_d = linear_g * (transform->grid_size-1) - y;
 541                 float z_d = linear_b * (transform->grid_size-1) - z;
 542
 543                 float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
 544                 float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
 545                 float r_y1 = lerp(r_x1, r_x2, y_d);
 546                 float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
 547                 float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
 548                 float r_y2 = lerp(r_x3, r_x4, y_d);
 549                 float clut_r = lerp(r_y1, r_y2, z_d);
 550
 551                 float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
 552                 float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
 553                 float g_y1 = lerp(g_x1, g_x2, y_d);
 554                 float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
 555                 float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
 556                 float g_y2 = lerp(g_x3, g_x4, y_d);
 557                 float clut_g = lerp(g_y1, g_y2, z_d);
 558
 559                 float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
 560                 float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
 561                 float b_y1 = lerp(b_x1, b_x2, y_d);
 562                 float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
 563                 float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
 564                 float b_y2 = lerp(b_x3, b_x4, y_d);
 565                 float clut_b = lerp(b_y1, b_y2, z_d);
 566
 567                 *dest++ = clamp_u8(clut_r*255.0f);
 568                 *dest++ = clamp_u8(clut_g*255.0f);
 569                 *dest++ = clamp_u8(clut_b*255.0f);
 570         }
 571 }
 572 */
 573
 574 static int int_div_ceil(int value, int div) {
 575         return ((value  + div - 1) / div);
 576 }
 577
 578 // Using lcms' tetra interpolation algorithm.
 579 static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
 580         unsigned int i;
 581         int xy_len = 1;
 582         int x_len = transform->grid_size;
 583         int len = x_len * x_len;
 584         float* r_table = transform->r_clut;
 585         float* g_table = transform->g_clut;
 586         float* b_table = transform->b_clut;
 587         float c0_r, c1_r, c2_r, c3_r;
 588         float c0_g, c1_g, c2_g, c3_g;
 589         float c0_b, c1_b, c2_b, c3_b;
 590         float clut_r, clut_g, clut_b;
 591         for (i = 0; i < length; i++) {
 592                 unsigned char in_r = *src++;
 593                 unsigned char in_g = *src++;
 594                 unsigned char in_b = *src++;
 595                 unsigned char in_a = *src++;
 596                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 597
 598                 int x = in_r * (transform->grid_size-1) / 255;
 599                 int y = in_g * (transform->grid_size-1) / 255;
 600                 int z = in_b * (transform->grid_size-1) / 255;
 601                 int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
 602                 int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
 603                 int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
 604                 float rx = linear_r * (transform->grid_size-1) - x;
 605                 float ry = linear_g * (transform->grid_size-1) - y;
 606                 float rz = linear_b * (transform->grid_size-1) - z;
 607
 608                 c0_r = CLU(r_table, x, y, z);
 609                 c0_g = CLU(g_table, x, y, z);
 610                 c0_b = CLU(b_table, x, y, z);
 611
 612                 if( rx >= ry ) {
 613                         if (ry >= rz) { //rx >= ry && ry >= rz
 614                                 c1_r = CLU(r_table, x_n, y, z) - c0_r;
 615                                 c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
 616                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 617                                 c1_g = CLU(g_table, x_n, y, z) - c0_g;
 618                                 c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
 619                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 620                                 c1_b = CLU(b_table, x_n, y, z) - c0_b;
 621                                 c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
 622                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 623                         } else {
 624                                 if (rx >= rz) { //rx >= rz && rz >= ry
 625                                         c1_r = CLU(r_table, x_n, y, z) - c0_r;
 626                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 627                                         c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
 628                                         c1_g = CLU(g_table, x_n, y, z) - c0_g;
 629                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 630                                         c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
 631                                         c1_b = CLU(b_table, x_n, y, z) - c0_b;
 632                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 633                                         c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
 634                                 } else { //rz > rx && rx >= ry
 635                                         c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
 636                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 637                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 638                                         c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
 639                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 640                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 641                                         c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
 642                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 643                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 644                                 }
 645                         }
 646                 } else {
 647                         if (rx >= rz) { //ry > rx && rx >= rz
 648                                 c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
 649                                 c2_r = CLU(r_table, x, y_n, z) - c0_r;
 650                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 651                                 c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
 652                                 c2_g = CLU(g_table, x, y_n, z) - c0_g;
 653                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 654                                 c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
 655                                 c2_b = CLU(b_table, x, y_n, z) - c0_b;
 656                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 657                         } else {
 658                                 if (ry >= rz) { //ry >= rz && rz > rx
 659                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 660                                         c2_r = CLU(r_table, x, y_n, z) - c0_r;
 661                                         c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
 662                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 663                                         c2_g = CLU(g_table, x, y_n, z) - c0_g;
 664                                         c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
 665                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 666                                         c2_b = CLU(b_table, x, y_n, z) - c0_b;
 667                                         c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
 668                                 } else { //rz > ry && ry > rx
 669                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 670                                         c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
 671                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 672                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 673                                         c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
 674                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 675                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 676                                         c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
 677                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 678                                 }
 679                         }
 680                 }
 681
 682                 clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 683                 clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 684                 clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 685
 686                 dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
 687                 dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
 688                 dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
 689                 dest[OUTPUT_A_INDEX] = in_a;
 690                 dest += RGBA_OUTPUT_COMPONENTS;
 691         }
 692 }
 693
 694 // Using lcms' tetra interpolation code.
 695 static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
 696         unsigned int i;
 697         int xy_len = 1;
 698         int x_len = transform->grid_size;
 699         int len = x_len * x_len;
 700         float* r_table = transform->r_clut;
 701         float* g_table = transform->g_clut;
 702         float* b_table = transform->b_clut;
 703         float c0_r, c1_r, c2_r, c3_r;
 704         float c0_g, c1_g, c2_g, c3_g;
 705         float c0_b, c1_b, c2_b, c3_b;
 706         float clut_r, clut_g, clut_b;
 707         for (i = 0; i < length; i++) {
 708                 unsigned char in_r = *src++;
 709                 unsigned char in_g = *src++;
 710                 unsigned char in_b = *src++;
 711                 float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 712
 713                 int x = in_r * (transform->grid_size-1) / 255;
 714                 int y = in_g * (transform->grid_size-1) / 255;
 715                 int z = in_b * (transform->grid_size-1) / 255;
 716                 int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
 717                 int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
 718                 int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
 719                 float rx = linear_r * (transform->grid_size-1) - x;
 720                 float ry = linear_g * (transform->grid_size-1) - y;
 721                 float rz = linear_b * (transform->grid_size-1) - z;
 722
 723                 c0_r = CLU(r_table, x, y, z);
 724                 c0_g = CLU(g_table, x, y, z);
 725                 c0_b = CLU(b_table, x, y, z);
 726
 727                 if( rx >= ry ) {
 728                         if (ry >= rz) { //rx >= ry && ry >= rz
 729                                 c1_r = CLU(r_table, x_n, y, z) - c0_r;
 730                                 c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
 731                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 732                                 c1_g = CLU(g_table, x_n, y, z) - c0_g;
 733                                 c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
 734                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 735                                 c1_b = CLU(b_table, x_n, y, z) - c0_b;
 736                                 c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
 737                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 738                         } else {
 739                                 if (rx >= rz) { //rx >= rz && rz >= ry
 740                                         c1_r = CLU(r_table, x_n, y, z) - c0_r;
 741                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 742                                         c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
 743                                         c1_g = CLU(g_table, x_n, y, z) - c0_g;
 744                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 745                                         c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
 746                                         c1_b = CLU(b_table, x_n, y, z) - c0_b;
 747                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 748                                         c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
 749                                 } else { //rz > rx && rx >= ry
 750                                         c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
 751                                         c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
 752                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 753                                         c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
 754                                         c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
 755                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 756                                         c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
 757                                         c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
 758                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 759                                 }
 760                         }
 761                 } else {
 762                         if (rx >= rz) { //ry > rx && rx >= rz
 763                                 c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
 764                                 c2_r = CLU(r_table, x, y_n, z) - c0_r;
 765                                 c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
 766                                 c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
 767                                 c2_g = CLU(g_table, x, y_n, z) - c0_g;
 768                                 c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
 769                                 c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
 770                                 c2_b = CLU(b_table, x, y_n, z) - c0_b;
 771                                 c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
 772                         } else {
 773                                 if (ry >= rz) { //ry >= rz && rz > rx
 774                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 775                                         c2_r = CLU(r_table, x, y_n, z) - c0_r;
 776                                         c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
 777                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 778                                         c2_g = CLU(g_table, x, y_n, z) - c0_g;
 779                                         c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
 780                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 781                                         c2_b = CLU(b_table, x, y_n, z) - c0_b;
 782                                         c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
 783                                 } else { //rz > ry && ry > rx
 784                                         c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
 785                                         c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
 786                                         c3_r = CLU(r_table, x, y, z_n) - c0_r;
 787                                         c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
 788                                         c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
 789                                         c3_g = CLU(g_table, x, y, z_n) - c0_g;
 790                                         c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
 791                                         c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
 792                                         c3_b = CLU(b_table, x, y, z_n) - c0_b;
 793                                 }
 794                         }
 795                 }
 796
 797                 clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 798                 clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 799                 clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 800
 801                 dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
 802                 dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
 803                 dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
 804                 dest += RGB_OUTPUT_COMPONENTS;
 805         }
 806 }
 807
 808 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 809 {
 810         unsigned int i;
 811         float (*mat)[4] = transform->matrix;
 812         for (i = 0; i < length; i++) {
 813                 unsigned char device_r = *src++;
 814                 unsigned char device_g = *src++;
 815                 unsigned char device_b = *src++;
 816                 float out_device_r, out_device_g, out_device_b;
 817
 818                 float linear_r = transform->input_gamma_table_r[device_r];
 819                 float linear_g = transform->input_gamma_table_g[device_g];
 820                 float linear_b = transform->input_gamma_table_b[device_b];
 821
 822                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 823                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 824                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 825
 826                 out_linear_r = clamp_float(out_linear_r);
 827                 out_linear_g = clamp_float(out_linear_g);
 828                 out_linear_b = clamp_float(out_linear_b);
 829
 830                 out_device_r = lut_interp_linear(out_linear_r,
 831                                 transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 832                 out_device_g = lut_interp_linear(out_linear_g,
 833                                 transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 834                 out_device_b = lut_interp_linear(out_linear_b,
 835                                 transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 836
 837                 dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
 838                 dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
 839                 dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
 840                 dest += RGB_OUTPUT_COMPONENTS;
 841         }
 842 }
 843
 844 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 845 {
 846         unsigned int i;
 847         float (*mat)[4] = transform->matrix;
 848         for (i = 0; i < length; i++) {
 849                 unsigned char device_r = *src++;
 850                 unsigned char device_g = *src++;
 851                 unsigned char device_b = *src++;
 852                 unsigned char alpha = *src++;
 853                 float out_device_r, out_device_g, out_device_b;
 854
 855                 float linear_r = transform->input_gamma_table_r[device_r];
 856                 float linear_g = transform->input_gamma_table_g[device_g];
 857                 float linear_b = transform->input_gamma_table_b[device_b];
 858
 859                 float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 860                 float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 861                 float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 862
 863                 out_linear_r = clamp_float(out_linear_r);
 864                 out_linear_g = clamp_float(out_linear_g);
 865                 out_linear_b = clamp_float(out_linear_b);
 866
 867                 out_device_r = lut_interp_linear(out_linear_r,
 868                                 transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 869                 out_device_g = lut_interp_linear(out_linear_g,
 870                                 transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 871                 out_device_b = lut_interp_linear(out_linear_b,
 872                                 transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 873
 874                 dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
 875                 dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
 876                 dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
 877                 dest[OUTPUT_A_INDEX] = alpha;
 878                 dest += RGBA_OUTPUT_COMPONENTS;
 879         }
 880 }
 881
/*
 * Disabled (compiled out by the `#if 0` below): RGB transform that applies the
 * input gamma tables and the 3x3 part of transform->matrix, then writes the
 * matrix result straight to dest (scaled by 255 and clamped to a byte) without
 * any output gamma lookup.
 */
#if 0
static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
        int i;
        float (*mat)[4] = transform->matrix;
        for (i = 0; i < length; i++) {
                unsigned char device_r = *src++;
                unsigned char device_g = *src++;
                unsigned char device_b = *src++;

                /* device byte -> linear value via the per-channel input tables */
                float linear_r = transform->input_gamma_table_r[device_r];
                float linear_g = transform->input_gamma_table_g[device_g];
                float linear_b = transform->input_gamma_table_b[device_b];

                float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
                float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
                float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

                /* output is written in R, G, B order, three bytes per pixel */
                *dest++ = clamp_u8(out_linear_r*255);
                *dest++ = clamp_u8(out_linear_g*255);
                *dest++ = clamp_u8(out_linear_b*255);
        }
}
#endif
 906
/*
 * If users create and destroy objects on different threads, even if the same
 * objects aren't used on different threads at the same time, we can still run
 * into trouble with refcounts if they aren't atomic.
 *
 * This can lead to us prematurely deleting the precache if threads get unlucky
 * and write the wrong value to the ref count.
 */

/*
 * Take one more reference on p: bumps p->ref_count through
 * qcms_atomic_increment() and returns the same pointer so the call can be used
 * directly in an assignment. p is dereferenced, so it must not be NULL.
 */
static struct precache_output *precache_reference(struct precache_output *p)
{
        qcms_atomic_increment(p->ref_count);
        return p;
}
 920
 921 static struct precache_output *precache_create()
 922 {
 923         struct precache_output *p = malloc(sizeof(struct precache_output));
 924         if (p)
 925                 p->ref_count = 1;
 926         return p;
 927 }
 928
 929 void precache_release(struct precache_output *p)
 930 {
 931         if (qcms_atomic_decrement(p->ref_count) == 0) {
 932                 free(p);
 933         }
 934 }
 935
 936 #ifdef HAVE_POSIX_MEMALIGN
 937 static qcms_transform *transform_alloc(void)
 938 {
 939         qcms_transform *t;
 940
 941         void *allocated_memory;
 942         if (!posix_memalign(&allocated_memory, 16, sizeof(qcms_transform))) {
 943                 /* Doing a memset to initialise all bits to 'zero'*/
 944                 memset(allocated_memory, 0, sizeof(qcms_transform));
 945                 t = allocated_memory;
 946                 return t;
 947         } else {
 948                 return NULL;
 949         }
 950 }
/* Release a transform obtained from the posix_memalign-based transform_alloc();
 * that memory is handed straight to free(). */
static void transform_free(qcms_transform *t)
{
        free(t);
}
 955 #else
 956 static qcms_transform *transform_alloc(void)
 957 {
 958         /* transform needs to be aligned on a 16byte boundrary */
 959         char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
 960         /* make room for a pointer to the block returned by calloc */
 961         void *transform_start = original_block + sizeof(void*);
 962         /* align transform_start */
 963         qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
 964
 965         /* store a pointer to the block returned by calloc so that we can free it later */
 966         void **(original_block_ptr) = (void**)transform_aligned;
 967         if (!original_block)
 968                 return NULL;
 969         original_block_ptr--;
 970         *original_block_ptr = original_block;
 971
 972         return transform_aligned;
 973 }
 974 static void transform_free(qcms_transform *t)
 975 {
 976         /* get at the pointer to the unaligned block returned by calloc */
 977         void **p = (void**)t;
 978         p--;
 979         free(*p);
 980 }
 981 #endif
 982
 983 void qcms_transform_release(qcms_transform *t)
 984 {
 985         /* ensure we only free the gamma tables once even if there are
 986          * multiple references to the same data */
 987
 988         if (t->output_table_r)
 989                 precache_release(t->output_table_r);
 990         if (t->output_table_g)
 991                 precache_release(t->output_table_g);
 992         if (t->output_table_b)
 993                 precache_release(t->output_table_b);
 994
 995         free(t->input_gamma_table_r);
 996         if (t->input_gamma_table_g != t->input_gamma_table_r)
 997                 free(t->input_gamma_table_g);
 998         if (t->input_gamma_table_g != t->input_gamma_table_r &&
 999             t->input_gamma_table_g != t->input_gamma_table_b)
1000                 free(t->input_gamma_table_b);
1001
1002         free(t->input_gamma_table_gray);
1003
1004         free(t->output_gamma_lut_r);
1005         free(t->output_gamma_lut_g);
1006         free(t->output_gamma_lut_b);
1007
1008         /* r_clut points to beginning of buffer allocated in qcms_transform_precacheLUT_float */
1009         if (t->r_clut)
1010                 free(t->r_clut);
1011
1012         transform_free(t);
1013 }
1014
1015 #ifdef X86
1016 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
1017 // mozilla/jpeg)
1018  // -------------------------------------------------------------------------
1019 #if defined(_M_IX86) && defined(_MSC_VER)
1020 #define HAS_CPUID
/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
   register - I'm not sure if that ever happens on windows, but cpuid isn't
   on the critical path so we just preserve the register to be safe and to be
   consistent with the non-windows version.

   Executes CPUID with EAX = fxn and stores the resulting EAX, EBX, ECX and EDX
   values through a, b, c and d respectively. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
       uint32_t a_, b_, c_, d_;
       __asm {
              /* park the caller's EBX in ESI while CPUID overwrites EBX */
              xchg   ebx, esi
              mov    eax, fxn
              cpuid
              mov    a_, eax
              mov    b_, ebx
              mov    c_, ecx
              mov    d_, edx
              /* put the caller's EBX back */
              xchg   ebx, esi
       }
       *a = a_;
       *b = b_;
       *c = c_;
       *d = d_;
}
1042 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
1043 #define HAS_CPUID
/* Get us a CPUID function. We can't use ebx because it's the PIC register on
   some platforms, so we use ESI instead and save ebx to avoid clobbering it.

   Executes CPUID with EAX = fxn and stores the resulting EAX, EBX, ECX and EDX
   values through a, b, c and d respectively. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {

        uint32_t a_, b_, c_, d_;
       /* First xchgl parks EBX in ESI; the second one restores EBX and leaves
          CPUID's EBX result in ESI, which is why b_ is bound to "=S". */
       __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
                             : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
           *a = a_;
           *b = b_;
           *c = c_;
           *d = d_;
}
1056 #endif
1057
1058 // -------------------------Runtime SSEx Detection-----------------------------
1059
1060 /* MMX is always supported per
1061  *  Gecko v1.9.1 minimum CPU requirements */
1062 #define SSE1_EDX_MASK (1UL << 25)
1063 #define SSE2_EDX_MASK (1UL << 26)
1064 #define SSE3_ECX_MASK (1UL <<  0)
1065
/*
 * Report the highest SSE level the CPU supports: 3, 2, 1, or 0 for none.
 *
 * On 64-bit x86 builds the answer is the compile-time constant 2. Otherwise,
 * when a cpuid() helper is available, CPUID function 1 is queried once and the
 * result is cached in a function-local static. Without cpuid() the function
 * reports 0.
 */
static int sse_version_available(void)
{
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
        /* Every 64-bit x86 CPU has SSE2. Returning a constant lets the compiler
         * discard the SSE1 and non-SIMD branches in callers. */
        return 2;
#elif defined(HAS_CPUID)
        static int cached_version = -1;

        if (cached_version == -1) {
                uint32_t eax, ebx, ecx, edx;
                int detected = 0;

                cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
                /* probe from the newest extension down to the oldest */
                if (ecx & SSE3_ECX_MASK) {
                        detected = 3;
                } else if (edx & SSE2_EDX_MASK) {
                        detected = 2;
                } else if (edx & SSE1_EDX_MASK) {
                        detected = 1;
                }
                cached_version = detected;
        }

        return cached_version;
#else
        return 0;
#endif
}
1094 #endif
1095
/* Bradford matrix used by compute_whitepoint_adaption(). The trailing `false`
 * initialises the struct's second member (its name is not visible in this
 * chunk). */
static const struct matrix bradford_matrix = {{ { 0.8951f, 0.2664f,-0.1614f},
                                                {-0.7502f, 1.7135f, 0.0367f},
                                                { 0.0389f,-0.0685f, 1.0296f}},
                                                false};

/* Counterpart of bradford_matrix, applied as the other outer factor in
 * compute_whitepoint_adaption(); by its name, the inverse of bradford_matrix. */
static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
                                                    { 0.4323053f, 0.5183603f, 0.0492912f},
                                                    {-0.0085287f, 0.0400428f, 0.9684867f}},
                                                    false};
1105
1106 // See ICCv4 E.3
1107 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
1108         float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
1109                   (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
1110         float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
1111                   (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
1112         float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
1113                   (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
1114         struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
1115         return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
1116 }
1117
1118 void qcms_profile_precache_output_transform(qcms_profile *profile)
1119 {
1120         /* we only support precaching on rgb profiles */
1121         if (profile->color_space != RGB_SIGNATURE)
1122                 return;
1123
1124         if (qcms_supports_iccv4) {
1125                 /* don't precache since we will use the B2A LUT */
1126                 if (profile->B2A0)
1127                         return;
1128
1129                 /* don't precache since we will use the mBA LUT */
1130                 if (profile->mBA)
1131                         return;
1132         }
1133
1134         /* don't precache if we do not have the TRC curves */
1135         if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
1136                 return;
1137
1138         if (!profile->output_table_r) {
1139                 profile->output_table_r = precache_create();
1140                 if (profile->output_table_r &&
1141                                 !compute_precache(profile->redTRC, profile->output_table_r->data)) {
1142                         precache_release(profile->output_table_r);
1143                         profile->output_table_r = NULL;
1144                 }
1145         }
1146         if (!profile->output_table_g) {
1147                 profile->output_table_g = precache_create();
1148                 if (profile->output_table_g &&
1149                                 !compute_precache(profile->greenTRC, profile->output_table_g->data)) {
1150                         precache_release(profile->output_table_g);
1151                         profile->output_table_g = NULL;
1152                 }
1153         }
1154         if (!profile->output_table_b) {
1155                 profile->output_table_b = precache_create();
1156                 if (profile->output_table_b &&
1157                                 !compute_precache(profile->blueTRC, profile->output_table_b->data)) {
1158                         precache_release(profile->output_table_b);
1159                         profile->output_table_b = NULL;
1160                 }
1161         }
1162 }
1163
1164 /* Replace the current transformation with a LUT transformation using a given number of sample points */
1165 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
1166                                                  int samples, qcms_data_type in_type)
1167 {
1168         /* The range between which 2 consecutive sample points can be used to interpolate */
1169         uint16_t x,y,z;
1170         uint32_t l;
1171         uint32_t lutSize = 3 * samples * samples * samples;
1172         float* src = NULL;
1173         float* dest = NULL;
1174         float* lut = NULL;
1175
1176         src = malloc(lutSize*sizeof(float));
1177         dest = malloc(lutSize*sizeof(float));
1178
1179         if (src && dest) {
1180                 /* Prepare a list of points we want to sample */
1181                 l = 0;
1182                 for (x = 0; x < samples; x++) {
1183                         for (y = 0; y < samples; y++) {
1184                                 for (z = 0; z < samples; z++) {
1185                                         src[l++] = x / (float)(samples-1);
1186                                         src[l++] = y / (float)(samples-1);
1187                                         src[l++] = z / (float)(samples-1);
1188                                 }
1189                         }
1190                 }
1191
1192                 lut = qcms_chain_transform(in, out, src, dest, lutSize);
1193                 if (lut) {
1194                         transform->r_clut = &lut[0];
1195                         transform->g_clut = &lut[1];
1196                         transform->b_clut = &lut[2];
1197                         transform->grid_size = samples;
1198                         if (in_type == QCMS_DATA_RGBA_8) {
1199                                 transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
1200                         } else {
1201                                 transform->transform_fn = qcms_transform_data_tetra_clut;
1202                         }
1203                 }
1204         }
1205
1206
1207         //XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
1208         // It will be stored in r_clut, which will be cleaned up in qcms_transform_release.
1209         if (src && lut != src) {
1210                 free(src);
1211         }
1212         if (dest && lut != dest) {
1213                 free(dest);
1214         }
1215
1216         if (lut == NULL) {
1217                 return NULL;
1218         }
1219         return transform;
1220 }
1221
1222 #define NO_MEM_TRANSFORM NULL
1223
1224 qcms_transform* qcms_transform_create(
1225                 qcms_profile *in, qcms_data_type in_type,
1226                 qcms_profile *out, qcms_data_type out_type,
1227                 qcms_intent intent)
1228 {
1229         bool precache = false;
1230
1231         qcms_transform *transform = transform_alloc();
1232         if (!transform) {
1233                 return NULL;
1234         }
1235         if (out_type != QCMS_DATA_RGB_8 &&
1236                 out_type != QCMS_DATA_RGBA_8) {
1237             assert(0 && "output type");
1238             qcms_transform_release(transform);
1239             return NULL;
1240         }
1241
1242         if (out->output_table_r &&
1243                         out->output_table_g &&
1244                         out->output_table_b) {
1245                 precache = true;
1246         }
1247
1248         // This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
1249         if (qcms_supports_iccv4 &&
1250                         (in_type == QCMS_DATA_RGB_8 || in_type == QCMS_DATA_RGBA_8) &&
1251                         (in->A2B0 || out->B2A0 || in->mAB || out->mAB))
1252                 {
1253                 // Precache the transformation to a CLUT 33x33x33 in size.
1254                 // 33 is used by many profiles and works well in pratice.
1255                 // This evenly divides 256 into blocks of 8x8x8.
1256                 // TODO For transforming small data sets of about 200x200 or less
1257                 // precaching should be avoided.
1258                 qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
1259                 if (!result) {
1260                         assert(0 && "precacheLUT failed");
1261                         qcms_transform_release(transform);
1262                         return NULL;
1263                 }
1264                 return result;
1265         }
1266
1267         if (precache) {
1268                 transform->output_table_r = precache_reference(out->output_table_r);
1269                 transform->output_table_g = precache_reference(out->output_table_g);
1270                 transform->output_table_b = precache_reference(out->output_table_b);
1271         } else {
1272                 if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
1273                         qcms_transform_release(transform);
1274                         return NO_MEM_TRANSFORM;
1275                 }
1276                 build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
1277                 build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
1278                 build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
1279                 if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
1280                         qcms_transform_release(transform);
1281                         return NO_MEM_TRANSFORM;
1282                 }
1283         }
1284
1285         if (in->color_space == RGB_SIGNATURE) {
1286                 struct matrix in_matrix, out_matrix, result;
1287
1288                 if (in_type != QCMS_DATA_RGB_8 &&
1289                     in_type != QCMS_DATA_RGBA_8){
1290                         assert(0 && "input type");
1291                         qcms_transform_release(transform);
1292                         return NULL;
1293                 }
1294                 if (precache) {
1295 #ifdef X86
1296                     if (sse_version_available() >= 2) {
1297                             if (in_type == QCMS_DATA_RGB_8)
1298                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
1299                             else
1300                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
1301
1302 #if !(defined(_MSC_VER) && defined(_M_AMD64))
1303                     /* Microsoft Compiler for x64 doesn't support MMX.
1304                      * SSE code uses MMX so that we disable on x64 */
1305                     } else
1306                     if (sse_version_available() >= 1) {
1307                             if (in_type == QCMS_DATA_RGB_8)
1308                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
1309                             else
1310                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
1311 #endif
1312                     } else
1313 #endif
1314 #if (defined(__POWERPC__) || defined(__powerpc__) && !defined(__NO_FPRS__))
1315                     if (have_altivec()) {
1316                             if (in_type == QCMS_DATA_RGB_8)
1317                                     transform->transform_fn = qcms_transform_data_rgb_out_lut_altivec;
1318                             else
1319                                     transform->transform_fn = qcms_transform_data_rgba_out_lut_altivec;
1320                     } else
1321 #endif
1322                         {
1323                                 if (in_type == QCMS_DATA_RGB_8)
1324                                         transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
1325                                 else
1326                                         transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
1327                         }
1328                 } else {
1329                         if (in_type == QCMS_DATA_RGB_8)
1330                                 transform->transform_fn = qcms_transform_data_rgb_out_lut;
1331                         else
1332                                 transform->transform_fn = qcms_transform_data_rgba_out_lut;
1333                 }
1334
1335                 //XXX: avoid duplicating tables if we can
1336                 transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
1337                 transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
1338                 transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
1339                 if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
1340                         qcms_transform_release(transform);
1341                         return NO_MEM_TRANSFORM;
1342                 }
1343
1344
1345                 /* build combined colorant matrix */
1346                 in_matrix = build_colorant_matrix(in);
1347                 out_matrix = build_colorant_matrix(out);
1348                 out_matrix = matrix_invert(out_matrix);
1349                 if (out_matrix.invalid) {
1350                         qcms_transform_release(transform);
1351                         return NULL;
1352                 }
1353                 result = matrix_multiply(out_matrix, in_matrix);
1354
1355                 /* check for NaN values in the matrix and bail if we find any */
1356                 for (unsigned i = 0 ; i < 3 ; ++i) {
1357                         for (unsigned j = 0 ; j < 3 ; ++j) {
1358                                 if (result.m[i][j] != result.m[i][j]) {
1359                                         qcms_transform_release(transform);
1360                                         return NULL;
1361                                 }
1362                         }
1363                 }
1364
1365                 /* store the results in column major mode
1366                  * this makes doing the multiplication with sse easier */
1367                 transform->matrix[0][0] = result.m[0][0];
1368                 transform->matrix[1][0] = result.m[0][1];
1369                 transform->matrix[2][0] = result.m[0][2];
1370                 transform->matrix[0][1] = result.m[1][0];
1371                 transform->matrix[1][1] = result.m[1][1];
1372                 transform->matrix[2][1] = result.m[1][2];
1373                 transform->matrix[0][2] = result.m[2][0];
1374                 transform->matrix[1][2] = result.m[2][1];
1375                 transform->matrix[2][2] = result.m[2][2];
1376
1377         } else if (in->color_space == GRAY_SIGNATURE) {
1378                 if (in_type != QCMS_DATA_GRAY_8 &&
1379                                 in_type != QCMS_DATA_GRAYA_8){
1380                         assert(0 && "input type");
1381                         qcms_transform_release(transform);
1382                         return NULL;
1383                 }
1384
1385                 transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
1386                 if (!transform->input_gamma_table_gray) {
1387                         qcms_transform_release(transform);
1388                         return NO_MEM_TRANSFORM;
1389                 }
1390
1391                 if (precache) {
1392                         if (in_type == QCMS_DATA_GRAY_8) {
1393                                 transform->transform_fn = qcms_transform_data_gray_out_precache;
1394                         } else {
1395                                 transform->transform_fn = qcms_transform_data_graya_out_precache;
1396                         }
1397                 } else {
1398                         if (in_type == QCMS_DATA_GRAY_8) {
1399                                 transform->transform_fn = qcms_transform_data_gray_out_lut;
1400                         } else {
1401                                 transform->transform_fn = qcms_transform_data_graya_out_lut;
1402                         }
1403                 }
1404         } else {
1405                 assert(0 && "unexpected colorspace");
1406                 qcms_transform_release(transform);
1407                 return NULL;
1408         }
1409         return transform;
1410 }
1411
1412 #if defined(__GNUC__) && defined(__i386__)
1413 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
1414 __attribute__((__force_align_arg_pointer__))
1415 #endif
1416 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
1417 {
1418         transform->transform_fn(transform, src, dest, length);
1419 }
1420
1421 qcms_bool qcms_supports_iccv4;
1422 void qcms_enable_iccv4()
1423 {
1424         qcms_supports_iccv4 = true;
1425 }