Bug 496271, automation config for Tb2.0.0.22 build1, p=joduinn, r=me
[mozilla-1.9.git] / jpeg / jdmerge.c
blob3239ddbde82ddbd89bec4268eeeb6572c8e44e1a
1 /*
2 * jdmerge.c
4 * Copyright (C) 1994-1996, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
8 * This file contains code for merged upsampling/color conversion.
10 * This file combines functions from jdsample.c and jdcolor.c;
11 * read those files first to understand what's going on.
13 * When the chroma components are to be upsampled by simple replication
14 * (ie, box filtering), we can save some work in color conversion by
15 * calculating all the output pixels corresponding to a pair of chroma
16 * samples at one time. In the conversion equations
17 * R = Y + K1 * Cr
18 * G = Y + K2 * Cb + K3 * Cr
19 * B = Y + K4 * Cb
20 * only the Y term varies among the group of pixels corresponding to a pair
21 * of chroma samples, so the rest of the terms can be calculated just once.
22 * At typical sampling ratios, this eliminates half or three-quarters of the
23 * multiplications needed for color conversion.
25 * This file currently provides implementations for the following cases:
26 * YCbCr => RGB color conversion only.
27 * Sampling ratios of 2h1v or 2h2v.
28 * No scaling needed at upsample time.
29 * Corner-aligned (non-CCIR601) sampling alignment.
30 * Other special cases could be added, but in most applications these are
31 * the only common cases. (For uncommon cases we fall back on the more
32 * general code in jdsample.c and jdcolor.c.)
35 #define JPEG_INTERNALS
36 #include "jinclude.h"
37 #include "jpeglib.h"
39 #ifdef UPSAMPLE_MERGING_SUPPORTED
41 #ifdef HAVE_MMX_INTEL_MNEMONICS
// Multiplier constants for the MMX YCbCr->RGB path (used with pmulhw in
// h2v2_merged_upsample_mmx).  Each __int64 packs four signed 16-bit words;
// the trailing comment names them from the high word down to the low word.
// Nonzero words are the conversion coefficients scaled by 2^14, e.g.
// 0x59BA = 22970 ~= 1.40200 * 16384 and 0xD24B = -11701 ~= -0.71414 * 16384.
// A zero word means that chroma channel does not contribute to that output.
42 __int64 const1 = 0x59BA0000D24B59BA; // Cr_r Cr_b Cr_g Cr_r
43 __int64 const2 = 0x00007168E9FA0000; // Cb_r Cb_b Cb_g Cb_r
44 __int64 const5 = 0x0000D24B59BA0000; // Cr_b Cr_g Cr_r Cr_b
45 __int64 const6 = 0x7168E9FA00007168; // Cb_b Cb_g Cb_r Cb_b
47 // constants for factors (One_Half/fix(x)) << 2
// These are added (paddsw) to the shifted chroma words before the pmulhw
// above; word layout matches the corresponding multiplier constant.
49 __int64 const05 = 0x0001000000000001; // Cr_r Cr_b Cr_g Cr_r
50 __int64 const15 = 0x00000001FFFA0000; // Cb_r Cb_b Cb_g Cb_r
51 __int64 const45 = 0x0000000000010000; // Cr_b Cr_g Cr_r Cr_b
52 __int64 const55 = 0x0001FFFA00000001; // Cb_b Cb_g Cb_r Cb_b
53 #endif
55 /* Private subobject */
57 typedef struct {
58 struct jpeg_upsampler pub; /* public fields */
60 /* Pointer to routine to do actual upsampling/conversion of one row group */
61 JMETHOD(void, upmethod, (j_decompress_ptr cinfo,
62 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
63 JSAMPARRAY output_buf));
65 /* Private state for YCC->RGB conversion */
66 int * Cr_r_tab; /* => table for Cr to R conversion */
67 int * Cb_b_tab; /* => table for Cb to B conversion */
68 INT32 * Cr_g_tab; /* => table for Cr to G conversion */
69 INT32 * Cb_g_tab; /* => table for Cb to G conversion */
71 /* For 2:1 vertical sampling, we produce two output rows at a time.
72 * We need a "spare" row buffer to hold the second output row if the
73 * application provides just a one-row buffer; we also use the spare
74 * to discard the dummy last row if the image height is odd.
76 JSAMPROW spare_row;
77 boolean spare_full; /* T if spare buffer is occupied */
79 JDIMENSION out_row_width; /* samples per output row */
80 JDIMENSION rows_to_go; /* counts rows remaining in image */
81 } my_upsampler;
83 typedef my_upsampler * my_upsample_ptr;
/* Fixed-point helpers used when building the YCC->RGB tables below. */
/* Number of fraction bits in the fixed-point representation. */
85 #define SCALEBITS 16 /* speediest right-shift on some machines */
/* The value 0.5 expressed with SCALEBITS fraction bits; added before a
 * right shift so the result is rounded rather than truncated. */
86 #define ONE_HALF ((INT32) 1 << (SCALEBITS-1))
/* Scale a floating constant by 2^SCALEBITS and add 0.5 before the
 * conversion to INT32. */
87 #define FIX(x) ((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
91 * Initialize tables for YCC->RGB colorspace conversion.
92 * This is taken directly from jdcolor.c; see that file for more info.
95 LOCAL(void)
96 build_ycc_rgb_table (j_decompress_ptr cinfo)
98 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
99 int i;
100 INT32 x;
101 SHIFT_TEMPS
103 upsample->Cr_r_tab = (int *)
104 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
105 (MAXJSAMPLE+1) * SIZEOF(int));
106 upsample->Cb_b_tab = (int *)
107 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
108 (MAXJSAMPLE+1) * SIZEOF(int));
109 upsample->Cr_g_tab = (INT32 *)
110 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
111 (MAXJSAMPLE+1) * SIZEOF(INT32));
112 upsample->Cb_g_tab = (INT32 *)
113 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
114 (MAXJSAMPLE+1) * SIZEOF(INT32));
116 for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
117 /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
118 /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
119 /* Cr=>R value is nearest int to 1.40200 * x */
120 upsample->Cr_r_tab[i] = (int)
121 RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
122 /* Cb=>B value is nearest int to 1.77200 * x */
123 upsample->Cb_b_tab[i] = (int)
124 RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
125 /* Cr=>G value is scaled-up -0.71414 * x */
126 upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
127 /* Cb=>G value is scaled-up -0.34414 * x */
128 /* We also add in ONE_HALF so that need not do it in inner loop */
129 upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
135 * Initialize for an upsampling pass.
138 METHODDEF(void)
139 start_pass_merged_upsample (j_decompress_ptr cinfo)
141 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
143 /* Mark the spare buffer empty */
144 upsample->spare_full = FALSE;
145 /* Initialize total-height counter for detecting bottom of image */
146 upsample->rows_to_go = cinfo->output_height;
151 * Control routine to do upsampling (and color conversion).
153 * The control routine just handles the row buffering considerations.
156 METHODDEF(void)
157 merged_2v_upsample (j_decompress_ptr cinfo,
158 JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
159 JDIMENSION in_row_groups_avail,
160 JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
161 JDIMENSION out_rows_avail)
162 /* 2:1 vertical sampling case: may need a spare row. */
164 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
165 JSAMPROW work_ptrs[2];
166 JDIMENSION num_rows; /* number of rows returned to caller */
168 if (upsample->spare_full) {
169 /* If we have a spare row saved from a previous cycle, just return it. */
170 jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
171 1, upsample->out_row_width);
172 num_rows = 1;
173 upsample->spare_full = FALSE;
174 } else {
175 /* Figure number of rows to return to caller. */
176 num_rows = 2;
177 /* Not more than the distance to the end of the image. */
178 if (num_rows > upsample->rows_to_go)
179 num_rows = upsample->rows_to_go;
180 /* And not more than what the client can accept: */
181 out_rows_avail -= *out_row_ctr;
182 if (num_rows > out_rows_avail)
183 num_rows = out_rows_avail;
184 /* Create output pointer array for upsampler. */
185 work_ptrs[0] = output_buf[*out_row_ctr];
186 if (num_rows > 1) {
187 work_ptrs[1] = output_buf[*out_row_ctr + 1];
188 } else {
189 work_ptrs[1] = upsample->spare_row;
190 upsample->spare_full = TRUE;
192 /* Now do the upsampling. */
193 (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
196 /* Adjust counts */
197 *out_row_ctr += num_rows;
198 upsample->rows_to_go -= num_rows;
199 /* When the buffer is emptied, declare this input row group consumed */
200 if (! upsample->spare_full)
201 (*in_row_group_ctr)++;
205 METHODDEF(void)
206 merged_1v_upsample (j_decompress_ptr cinfo,
207 JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
208 JDIMENSION in_row_groups_avail,
209 JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
210 JDIMENSION out_rows_avail)
211 /* 1:1 vertical sampling case: much easier, never need a spare row. */
213 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
215 /* Just do the upsampling. */
216 (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
217 output_buf + *out_row_ctr);
218 /* Adjust counts */
219 (*out_row_ctr)++;
220 (*in_row_group_ctr)++;
225 * These are the routines invoked by the control routines to do
226 * the actual upsampling/conversion. One row group is processed per call.
228 * Note: since we may be writing directly into application-supplied buffers,
229 * we have to be honest about the output width; we can't assume the buffer
230 * has been rounded up to an even width.
235 * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
238 METHODDEF(void)
239 h2v1_merged_upsample (j_decompress_ptr cinfo,
240 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
241 JSAMPARRAY output_buf)
245 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
246 register int y, cred, cgreen, cblue;
247 int cb, cr;
248 register JSAMPROW outptr;
249 JSAMPROW inptr0, inptr1, inptr2;
250 JDIMENSION col;
251 /* copy these pointers into registers if possible */
252 register JSAMPLE * range_limit = cinfo->sample_range_limit;
253 int * Crrtab = upsample->Cr_r_tab;
254 int * Cbbtab = upsample->Cb_b_tab;
255 INT32 * Crgtab = upsample->Cr_g_tab;
256 INT32 * Cbgtab = upsample->Cb_g_tab;
257 SHIFT_TEMPS
259 inptr0 = input_buf[0][in_row_group_ctr];
260 inptr1 = input_buf[1][in_row_group_ctr];
261 inptr2 = input_buf[2][in_row_group_ctr];
262 outptr = output_buf[0];
263 /* Loop for each pair of output pixels */
264 for (col = cinfo->output_width >> 1; col > 0; col--) {
265 /* Do the chroma part of the calculation */
266 cb = GETJSAMPLE(*inptr1++);
267 cr = GETJSAMPLE(*inptr2++);
268 cred = Crrtab[cr];
269 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
270 cblue = Cbbtab[cb];
271 /* Fetch 2 Y values and emit 2 pixels */
272 y = GETJSAMPLE(*inptr0++);
273 outptr[RGB_RED] = range_limit[y + cred];
274 outptr[RGB_GREEN] = range_limit[y + cgreen];
275 outptr[RGB_BLUE] = range_limit[y + cblue];
276 outptr += RGB_PIXELSIZE;
277 y = GETJSAMPLE(*inptr0++);
278 outptr[RGB_RED] = range_limit[y + cred];
279 outptr[RGB_GREEN] = range_limit[y + cgreen];
280 outptr[RGB_BLUE] = range_limit[y + cblue];
281 outptr += RGB_PIXELSIZE;
283 /* If image width is odd, do the last output column separately */
284 if (cinfo->output_width & 1) {
285 cb = GETJSAMPLE(*inptr1);
286 cr = GETJSAMPLE(*inptr2);
287 cred = Crrtab[cr];
288 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
289 cblue = Cbbtab[cb];
290 y = GETJSAMPLE(*inptr0);
291 outptr[RGB_RED] = range_limit[y + cred];
292 outptr[RGB_GREEN] = range_limit[y + cgreen];
293 outptr[RGB_BLUE] = range_limit[y + cblue];
299 * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
302 #ifdef HAVE_MMX_INTEL_MNEMONICS
/* MMX builds only: forward declarations of the two 2h2v workers that the
 * h2v2_merged_upsample entry point chooses between (plain C and MMX). */
303 __inline METHODDEF(void)
304 h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
305 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
306 JSAMPARRAY output_buf);
307 __inline METHODDEF(void)
308 h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
309 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
310 JSAMPARRAY output_buf);
311 #endif
/* Forward declaration of the 2h2v entry point; it is defined further down,
 * once for MMX builds and once for all other builds. */
313 METHODDEF(void)
314 h2v2_merged_upsample (j_decompress_ptr cinfo,
315 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
316 JSAMPARRAY output_buf);
318 #ifdef HAVE_MMX_INTEL_MNEMONICS
319 METHODDEF(void)
320 h2v2_merged_upsample (j_decompress_ptr cinfo,
321 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
322 JSAMPARRAY output_buf)
324 if (MMXAvailable && (cinfo->image_width >= 8))
325 h2v2_merged_upsample_mmx (cinfo, input_buf, in_row_group_ctr, output_buf);
326 else
327 h2v2_merged_upsample_orig (cinfo, input_buf, in_row_group_ctr, output_buf);
331 __inline METHODDEF(void)
332 h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
333 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
334 JSAMPARRAY output_buf)
337 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
338 register int y, cred, cgreen, cblue;
339 int cb, cr;
340 register JSAMPROW outptr0, outptr1;
341 JSAMPROW inptr00, inptr01, inptr1, inptr2;
342 JDIMENSION col;
343 /* copy these pointers into registers if possible */
344 register JSAMPLE * range_limit = cinfo->sample_range_limit;
345 int * Crrtab = upsample->Cr_r_tab;
346 int * Cbbtab = upsample->Cb_b_tab;
347 INT32 * Crgtab = upsample->Cr_g_tab;
348 INT32 * Cbgtab = upsample->Cb_g_tab;
349 SHIFT_TEMPS
351 inptr00 = input_buf[0][in_row_group_ctr*2];
352 inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
353 inptr1 = input_buf[1][in_row_group_ctr];
354 inptr2 = input_buf[2][in_row_group_ctr];
355 outptr0 = output_buf[0];
356 outptr1 = output_buf[1];
357 /* Loop for each group of output pixels */
358 for (col = cinfo->output_width >> 1; col > 0; col--) {
359 /* Do the chroma part of the calculation */
360 cb = GETJSAMPLE(*inptr1++);
361 cr = GETJSAMPLE(*inptr2++);
362 cred = Crrtab[cr];
363 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
364 cblue = Cbbtab[cb];
365 /* Fetch 4 Y values and emit 4 pixels */
366 y = GETJSAMPLE(*inptr00++);
367 outptr0[RGB_RED] = range_limit[y + cred];
368 outptr0[RGB_GREEN] = range_limit[y + cgreen];
369 outptr0[RGB_BLUE] = range_limit[y + cblue];
370 outptr0 += RGB_PIXELSIZE;
371 y = GETJSAMPLE(*inptr00++);
372 outptr0[RGB_RED] = range_limit[y + cred];
373 outptr0[RGB_GREEN] = range_limit[y + cgreen];
374 outptr0[RGB_BLUE] = range_limit[y + cblue];
375 outptr0 += RGB_PIXELSIZE;
376 y = GETJSAMPLE(*inptr01++);
377 outptr1[RGB_RED] = range_limit[y + cred];
378 outptr1[RGB_GREEN] = range_limit[y + cgreen];
379 outptr1[RGB_BLUE] = range_limit[y + cblue];
380 outptr1 += RGB_PIXELSIZE;
381 y = GETJSAMPLE(*inptr01++);
382 outptr1[RGB_RED] = range_limit[y + cred];
383 outptr1[RGB_GREEN] = range_limit[y + cgreen];
384 outptr1[RGB_BLUE] = range_limit[y + cblue];
385 outptr1 += RGB_PIXELSIZE;
387 /* If image width is odd, do the last output column separately */
388 if (cinfo->output_width & 1) {
389 cb = GETJSAMPLE(*inptr1);
390 cr = GETJSAMPLE(*inptr2);
391 cred = Crrtab[cr];
392 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
393 cblue = Cbbtab[cb];
394 y = GETJSAMPLE(*inptr00);
395 outptr0[RGB_RED] = range_limit[y + cred];
396 outptr0[RGB_GREEN] = range_limit[y + cgreen];
397 outptr0[RGB_BLUE] = range_limit[y + cblue];
398 y = GETJSAMPLE(*inptr01);
399 outptr1[RGB_RED] = range_limit[y + cred];
400 outptr1[RGB_GREEN] = range_limit[y + cgreen];
401 outptr1[RGB_BLUE] = range_limit[y + cblue];
406 * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
408 __inline METHODDEF(void)
409 h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
410 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
411 JSAMPARRAY output_buf)
413 // added for MMX
414 __int64 const128 = 0x0080008000800080;
415 __int64 empty = 0x0000000000000000;
416 __int64 davemask = 0x0000FFFFFFFF0000;
417 ////////////////////////////////
419 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
420 register int y, cred, cgreen, cblue;
421 int cb, cr;
422 register JSAMPROW outptr0, outptr1;
423 JSAMPROW inptr00, inptr01, inptr1, inptr2;
424 JDIMENSION col;
425 /* copy these pointers into registers if possible */
426 register JSAMPLE * range_limit = cinfo->sample_range_limit;
427 int * Crrtab = upsample->Cr_r_tab;
428 int * Cbbtab = upsample->Cb_b_tab;
429 INT32 * Crgtab = upsample->Cr_g_tab;
430 INT32 * Cbgtab = upsample->Cb_g_tab;
431 SHIFT_TEMPS
434 // Added for MMX
435 register int width = cinfo->image_width;
436 int cols = cinfo->output_width;
437 int cols_asm = (cols >> 3);
438 int diff = cols - (cols_asm<<3);
439 int cols_asm_copy = cols_asm;
441 ///////////////////////////////////////
443 inptr00 = input_buf[0][in_row_group_ctr*2];
444 inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
445 inptr1 = input_buf[1][in_row_group_ctr];
446 inptr2 = input_buf[2][in_row_group_ctr];
447 outptr0 = output_buf[0];
448 outptr1 = output_buf[1];
449 /* Loop for each group of output pixels */
452 _asm
454 mov esi, inptr00
456 mov eax, inptr01
458 mov ebx, inptr2
460 mov ecx, inptr1
462 mov edi, outptr0
464 mov edx, outptr1
466 do_next16:
468 movd mm0, [ebx] ; Cr7 Cr6.....Cr1 Cr0
470 pxor mm6, mm6
472 punpcklbw mm0, mm0 ; Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0
474 movq mm7, const128
476 punpcklwd mm0, mm0 ; Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0
478 movq mm4, mm0
480 punpcklbw mm0, mm6 ; Cr0 Cr0 Cr0 Cr0
482 psubsw mm0, mm7 ; Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128
484 movd mm1, [ecx] ; Cb7 Cb6...... Cb1 Cb0
486 psllw mm0, 2 ; left shift by 2 bits
488 punpcklbw mm1, mm1 ; Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0
490 paddsw mm0, const05 ; add (one_half/fix(x)) << 2
492 punpcklwd mm1, mm1 ; Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0
494 movq mm5, mm1
496 pmulhw mm0, const1 ; multiply by (fix(x) >> 1)
498 punpcklbw mm1, mm6 ; Cb0 Cb0 Cb0 Cb0
500 punpckhbw mm4, mm6 ; Cr1 Cr1 Cr1 Cr1
502 psubsw mm1, mm7 ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
504 punpckhbw mm5, mm6 ; Cb1 Cb1 Cb1 Cb1
506 psllw mm1, 2 ; left shift by 2 bits
508 paddsw mm1, const15 ; add (one_half/fix(x)) << 2
510 psubsw mm4, mm7 ; Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128
512 psubsw mm5, mm7 ; Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128
514 pmulhw mm1, const2 ; multiply by (fix(x) >> 1)
516 psllw mm4, 2 ; left shift by 2 bits
518 psllw mm5, 2 ; left shift by 2 bits
520 paddsw mm4, const45 ; add (one_half/fix(x)) << 2
522 movd mm7, [esi] ; Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0
524 pmulhw mm4, const5 ; multiply by (fix(x) >> 1)
526 movq mm6, mm7
528 punpcklbw mm7, mm7 ; Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0
530 paddsw mm5, const55 ; add (one_half/fix(x)) << 2
532 paddsw mm0, mm1 ; cred0 cbl0 cgr0 cred0
534 movq mm1, mm7
536 pmulhw mm5, const6 ; multiply by (fix(x) >> 1)
538 movq mm2, mm0 ; cred0 cbl0 cgr0 cred0
540 punpcklwd mm7, mm6 ; Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0
542 pand mm2, davemask ; 0 cbl0 cgr0 0
544 psrlq mm1, 16 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
546 psrlq mm2, 16 ; 0 0 cbl0 cgr0
548 punpcklbw mm7, empty ; Y1 Y0 Y0 Y0
550 paddsw mm4, mm5 ; cbl1 cgr1 cred1 cbl1
552 movq mm3, mm4 ; cbl1 cgr1 cred1 cbl1
554 pand mm3, davemask ; 0 cgr1 cred1 0
556 paddsw mm7, mm0 ; r1 b0 g0 r0
558 psllq mm3, 16 ; cgr1 cred1 0 0
560 movq mm6, mm1 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
562 por mm2, mm3 ; cgr1 cred1 cbl0 cgr0
564 punpcklbw mm6, empty ; Y4 Y4 Y1 Y1
566 movd mm3, [eax] ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
568 paddsw mm6, mm2 ; g4 r4 b1 g1
570 packuswb mm7, mm6 ; g4 r4 b1 g1 r1 b0 g0 r0
572 movq mm6, mm3 ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
574 punpcklbw mm3, mm3 ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
576 movq [edi], mm7 ; move to memory g4 r4 b1 g1 r1 b0 g0 r0
578 movq mm5, mm3 ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
580 punpcklwd mm3, mm6 ; X X X X Y3 Y2 Y2 Y2
582 punpcklbw mm3, empty ; Y3 Y2 Y2 Y2
584 psrlq mm5, 16 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
586 paddsw mm3, mm0 ; r3 b2 g2 r2
588 movq mm6, mm5 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
590 movq mm0, mm1 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
592 punpckldq mm6, mm6 ; X X X X Y6 Y6 Y3 Y3
594 punpcklbw mm6, empty ; Y6 Y6 Y3 Y3
596 psrlq mm1, 24 ; 0 0 0 0 0 Y5 Y5 Y4
598 paddsw mm6, mm2 ; g6 r6 b3 g3
600 packuswb mm3, mm6 ; g6 r6 b3 g3 r3 b2 g2 r2
602 movq mm2, mm5 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
604 psrlq mm0, 32 ; 0 0 0 0 0 0 Y5 Y5
606 movq [edx], mm3 ; move to memory g6 r6 b3 g3 r3 b2 g2 r2
608 punpcklwd mm1, mm0 ; X X X X Y5 Y5 Y5 Y4
610 psrlq mm5, 24 ; 0 0 0 0 0 Y7 Y7 Y6
612 movd mm0, [ebx] ; Cr9 Cr8.....Cr3 Cr2
614 psrlq mm2, 32 ; 0 0 0 0 0 0 Y7 Y7
616 psrlq mm0, 16
618 punpcklbw mm1, empty ; Y5 Y5 Y5 Y4
620 punpcklwd mm5, mm2 ; X X X X Y7 Y7 Y7 Y6
622 paddsw mm1, mm4 ; b5 g5 r5 b4
624 punpcklbw mm5, empty ; Y7 Y7 Y7 Y6
626 pxor mm6, mm6 ; clear mm6 registr
628 punpcklbw mm0, mm0 ; X X X X Cr3 Cr3 Cr2 Cr2
630 paddsw mm5, mm4 ; b7 g7 r7 b6
632 punpcklwd mm0, mm0 ; Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2
634 movq mm4, mm0
636 movd mm3, [ecx] ; Cb9 Cb8...... Cb3 Cb2
638 punpcklbw mm0, mm6 ; Cr2 Cr2 Cr2 Cr2
640 psrlq mm3, 16
642 psubsw mm0, const128 ; Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128
644 punpcklbw mm3, mm3 ; X X X X Cb3 Cb3 Cb2 Cb2
646 psllw mm0, 2 ; left shift by 2 bits
648 paddsw mm0, const05 ; add (one_half/fix(x)) << 2
650 punpcklwd mm3, mm3 ; Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2
652 movq mm7, mm3
654 pmulhw mm0, const1 ; multiply by (fix(x) >> 1)
656 punpcklbw mm3, mm6 ; Cb2 Cb2 Cb2 Cb2
658 psubsw mm3, const128 ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
660 punpckhbw mm4, mm6 ; Cr3 Cr3 Cr3 Cr3
662 psllw mm3, 2 ; left shift by 2 bits
664 paddsw mm3, const15 ; add (one_half/fix(x)) << 2
666 punpckhbw mm7, mm6 ; Cb3 Cb3 Cb3 Cb3
668 pmulhw mm3, const2 ; multiply by (fix(x) >> 1)
670 psubsw mm7, const128 ; Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128
672 paddsw mm0, mm3 ; cred2 cbl2 cgr2 cred2
674 psllw mm7, 2 ; left shift by 2 bits
676 psubsw mm4, const128 ; Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128
678 movd mm3, [esi+4] ; Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
680 psllw mm4, 2 ; left shift by 2 bits
682 paddsw mm7, const55 ; add (one_half/fix(x)) << 2
684 movq mm6, mm3 ; Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
686 movq mm2, mm0
688 pand mm2, davemask
690 punpcklbw mm3, mm3 ; Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8
692 psrlq mm2, 16
694 paddsw mm4, const45 ; add (one_half/fix(x)) << 2
696 punpcklwd mm3, mm6 ; X X X X Y9 Y8 Y8 Y8
698 pmulhw mm4, const5 ; multiply by (fix(x) >> 1)
700 pmulhw mm7, const6 ; multiply by (fix(x) >> 1)
702 punpcklbw mm3, empty ; Y9 Y8 Y8 Y8
704 paddsw mm4, mm7 ; cbl3 cgr3 cred3 cbl3
706 paddsw mm3, mm0 ; r9 b8 g8 r8
708 movq mm7, mm4
710 packuswb mm1, mm3 ; r9 b8 g8 r8 b5 g5 r5 b4
712 movd mm3, [eax+4] ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
714 pand mm7, davemask
716 psrlq mm6, 8 ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
718 psllq mm7, 16
720 movq [edi+8], mm1 ; move to memory r9 b8 g8 r8 b5 g5 r5 b4
722 por mm2, mm7
724 movq mm7, mm3 ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
726 punpcklbw mm3, mm3 ; X X X X Y11 Y11 Y10 Y10
728 pxor mm1, mm1
730 punpcklwd mm3, mm7 ; X X X X Y11 Y10 Y10 Y10
732 punpcklbw mm3, mm1 ; Y11 Y10 Y10 Y10
734 psrlq mm7, 8 ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
736 paddsw mm3, mm0 ; r11 b10 g10 r10
738 movq mm0, mm7 ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
740 packuswb mm5, mm3 ; r11 b10 g10 r10 b7 g7 r7 b6
742 punpcklbw mm7, mm7 ; X X X X Y14 Y14 Y11 Y11
744 movq [edx+8], mm5 ; move to memory r11 b10 g10 r10 b7 g7 r7 b6
746 movq mm3, mm6 ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
748 punpcklbw mm6, mm6 ; X X X X Y12 Y12 Y9 Y9
750 punpcklbw mm7, mm1 ; Y14 Y14 Y11 Y11
752 punpcklbw mm6, mm1 ; Y12 Y12 Y9 Y9
754 paddsw mm7, mm2 ; g14 r14 b11 g11
756 paddsw mm6, mm2 ; g12 r12 b9 g9
758 psrlq mm3, 8 ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
760 movq mm1, mm3 ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
762 punpcklbw mm3, mm3 ; X X X X Y13 Y13 Y12 Y12
764 add esi, 8
766 psrlq mm3, 16 ; X X X X X X Y13 Y13 modified on 09/24
768 punpcklwd mm1, mm3 ; X X X X Y13 Y13 Y13 Y12
770 add eax, 8
772 psrlq mm0, 8 ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14
774 punpcklbw mm1, empty ; Y13 Y13 Y13 Y12
776 movq mm5, mm0 ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14
778 punpcklbw mm0, mm0 ; X X X X Y15 Y15 Y14 Y14
780 paddsw mm1, mm4 ; b13 g13 r13 b12
782 psrlq mm0, 16 ; X X X X X X Y15 Y15
784 add edi, 24
786 punpcklwd mm5, mm0 ; X X X X Y15 Y15 Y15 Y14
788 packuswb mm6, mm1 ; b13 g13 r13 b12 g12 r12 b9 g9
790 add edx, 24
792 punpcklbw mm5, empty ; Y15 Y15 Y15 Y14
794 add ebx, 4
796 paddsw mm5, mm4 ; b15 g15 r15 b14
798 movq [edi-8], mm6 ; move to memory b13 g13 r13 b12 g12 r12 b9 g9
800 packuswb mm7, mm5 ; b15 g15 r15 b14 g14 r14 b11 g11
802 add ecx, 4
804 movq [edx-8], mm7 ; move to memory b15 g15 r15 b14 g14 r14 b11 g11
806 dec cols_asm
808 jnz do_next16
810 EMMS
815 inptr1 += (cols_asm_copy<<2);
817 inptr2 += (cols_asm_copy<<2);
819 inptr00 += (cols_asm_copy<<3);
821 inptr01 += (cols_asm_copy<<3);
823 outptr0 += cols_asm_copy*24;
825 outptr1 += cols_asm_copy*24;
827 //for (col = cinfo->output_width >> 1; col > 0; col--) {
828 /* Do the chroma part of the calculation */
829 /*cb = GETJSAMPLE(*inptr1++);
830 cr = GETJSAMPLE(*inptr2++);
831 cred = Crrtab[cr];
832 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
833 cblue = Cbbtab[cb];*/
834 /* Fetch 4 Y values and emit 4 pixels */
835 /*y = GETJSAMPLE(*inptr00++);
836 outptr0[RGB_RED] = range_limit[y + cred];
837 outptr0[RGB_GREEN] = range_limit[y + cgreen];
838 outptr0[RGB_BLUE] = range_limit[y + cblue];
839 outptr0 += RGB_PIXELSIZE;
840 y = GETJSAMPLE(*inptr00++);
841 outptr0[RGB_RED] = range_limit[y + cred];
842 outptr0[RGB_GREEN] = range_limit[y + cgreen];
843 outptr0[RGB_BLUE] = range_limit[y + cblue];
844 outptr0 += RGB_PIXELSIZE;
845 y = GETJSAMPLE(*inptr01++);
846 outptr1[RGB_RED] = range_limit[y + cred];
847 outptr1[RGB_GREEN] = range_limit[y + cgreen];
848 outptr1[RGB_BLUE] = range_limit[y + cblue];
849 outptr1 += RGB_PIXELSIZE;
850 y = GETJSAMPLE(*inptr01++);
851 outptr1[RGB_RED] = range_limit[y + cred];
852 outptr1[RGB_GREEN] = range_limit[y + cgreen];
853 outptr1[RGB_BLUE] = range_limit[y + cblue];
854 outptr1 += RGB_PIXELSIZE;
855 } */
858 for (col = diff >> 1; col > 0; col--) {
859 /* Do the chroma part of the calculation */
860 cb = GETJSAMPLE(*inptr1++);
861 cr = GETJSAMPLE(*inptr2++);
862 cred = Crrtab[cr];
863 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
864 cblue = Cbbtab[cb];
865 /* Fetch 4 Y values and emit 4 pixels */
866 y = GETJSAMPLE(*inptr00++);
867 outptr0[RGB_RED] = range_limit[y + cred];
868 outptr0[RGB_GREEN] = range_limit[y + cgreen];
869 outptr0[RGB_BLUE] = range_limit[y + cblue];
870 outptr0 += RGB_PIXELSIZE;
871 y = GETJSAMPLE(*inptr00++);
872 outptr0[RGB_RED] = range_limit[y + cred];
873 outptr0[RGB_GREEN] = range_limit[y + cgreen];
874 outptr0[RGB_BLUE] = range_limit[y + cblue];
875 outptr0 += RGB_PIXELSIZE;
876 y = GETJSAMPLE(*inptr01++);
877 outptr1[RGB_RED] = range_limit[y + cred];
878 outptr1[RGB_GREEN] = range_limit[y + cgreen];
879 outptr1[RGB_BLUE] = range_limit[y + cblue];
880 outptr1 += RGB_PIXELSIZE;
881 y = GETJSAMPLE(*inptr01++);
882 outptr1[RGB_RED] = range_limit[y + cred];
883 outptr1[RGB_GREEN] = range_limit[y + cgreen];
884 outptr1[RGB_BLUE] = range_limit[y + cblue];
885 outptr1 += RGB_PIXELSIZE;
889 /* If image width is odd, do the last output column separately */
890 //if (cinfo->output_width & 1) {
891 if (diff & 1) {
892 cb = GETJSAMPLE(*inptr1);
893 cr = GETJSAMPLE(*inptr2);
894 cred = Crrtab[cr];
895 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
896 cblue = Cbbtab[cb];
897 y = GETJSAMPLE(*inptr00);
898 outptr0[RGB_RED] = range_limit[y + cred];
899 outptr0[RGB_GREEN] = range_limit[y + cgreen];
900 outptr0[RGB_BLUE] = range_limit[y + cblue];
901 y = GETJSAMPLE(*inptr01);
902 outptr1[RGB_RED] = range_limit[y + cred];
903 outptr1[RGB_GREEN] = range_limit[y + cgreen];
904 outptr1[RGB_BLUE] = range_limit[y + cblue];
907 #else
910 METHODDEF(void)
911 h2v2_merged_upsample (j_decompress_ptr cinfo,
912 JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
913 JSAMPARRAY output_buf)
915 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
916 register int y, cred, cgreen, cblue;
917 int cb, cr;
918 register JSAMPROW outptr0, outptr1;
919 JSAMPROW inptr00, inptr01, inptr1, inptr2;
920 JDIMENSION col;
921 /* copy these pointers into registers if possible */
922 register JSAMPLE * range_limit = cinfo->sample_range_limit;
923 int * Crrtab = upsample->Cr_r_tab;
924 int * Cbbtab = upsample->Cb_b_tab;
925 INT32 * Crgtab = upsample->Cr_g_tab;
926 INT32 * Cbgtab = upsample->Cb_g_tab;
927 SHIFT_TEMPS
929 inptr00 = input_buf[0][in_row_group_ctr*2];
930 inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
931 inptr1 = input_buf[1][in_row_group_ctr];
932 inptr2 = input_buf[2][in_row_group_ctr];
933 outptr0 = output_buf[0];
934 outptr1 = output_buf[1];
935 /* Loop for each group of output pixels */
936 for (col = cinfo->output_width >> 1; col > 0; col--) {
937 /* Do the chroma part of the calculation */
938 cb = GETJSAMPLE(*inptr1++);
939 cr = GETJSAMPLE(*inptr2++);
940 cred = Crrtab[cr];
941 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
942 cblue = Cbbtab[cb];
943 /* Fetch 4 Y values and emit 4 pixels */
944 y = GETJSAMPLE(*inptr00++);
945 outptr0[RGB_RED] = range_limit[y + cred];
946 outptr0[RGB_GREEN] = range_limit[y + cgreen];
947 outptr0[RGB_BLUE] = range_limit[y + cblue];
948 outptr0 += RGB_PIXELSIZE;
949 y = GETJSAMPLE(*inptr00++);
950 outptr0[RGB_RED] = range_limit[y + cred];
951 outptr0[RGB_GREEN] = range_limit[y + cgreen];
952 outptr0[RGB_BLUE] = range_limit[y + cblue];
953 outptr0 += RGB_PIXELSIZE;
954 y = GETJSAMPLE(*inptr01++);
955 outptr1[RGB_RED] = range_limit[y + cred];
956 outptr1[RGB_GREEN] = range_limit[y + cgreen];
957 outptr1[RGB_BLUE] = range_limit[y + cblue];
958 outptr1 += RGB_PIXELSIZE;
959 y = GETJSAMPLE(*inptr01++);
960 outptr1[RGB_RED] = range_limit[y + cred];
961 outptr1[RGB_GREEN] = range_limit[y + cgreen];
962 outptr1[RGB_BLUE] = range_limit[y + cblue];
963 outptr1 += RGB_PIXELSIZE;
965 /* If image width is odd, do the last output column separately */
966 if (cinfo->output_width & 1) {
967 cb = GETJSAMPLE(*inptr1);
968 cr = GETJSAMPLE(*inptr2);
969 cred = Crrtab[cr];
970 cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
971 cblue = Cbbtab[cb];
972 y = GETJSAMPLE(*inptr00);
973 outptr0[RGB_RED] = range_limit[y + cred];
974 outptr0[RGB_GREEN] = range_limit[y + cgreen];
975 outptr0[RGB_BLUE] = range_limit[y + cblue];
976 y = GETJSAMPLE(*inptr01);
977 outptr1[RGB_RED] = range_limit[y + cred];
978 outptr1[RGB_GREEN] = range_limit[y + cgreen];
979 outptr1[RGB_BLUE] = range_limit[y + cblue];
982 #endif
986 * Module initialization routine for merged upsampling/color conversion.
988 * NB: this is called under the conditions determined by use_merged_upsample()
989 * in jdmaster.c. That routine MUST correspond to the actual capabilities
990 * of this module; no safety checks are made here.
993 GLOBAL(void)
994 jinit_merged_upsampler (j_decompress_ptr cinfo)
996 my_upsample_ptr upsample;
998 upsample = (my_upsample_ptr)
999 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1000 SIZEOF(my_upsampler));
1001 cinfo->upsample = (struct jpeg_upsampler *) upsample;
1002 upsample->pub.start_pass = start_pass_merged_upsample;
1003 upsample->pub.need_context_rows = FALSE;
1005 upsample->out_row_width = cinfo->output_width * cinfo->out_color_components;
1007 if (cinfo->max_v_samp_factor == 2) {
1008 upsample->pub.upsample = merged_2v_upsample;
1009 upsample->upmethod = h2v2_merged_upsample;
1010 /* Allocate a spare row buffer */
1011 upsample->spare_row = (JSAMPROW)
1012 (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1013 (size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
1014 } else {
1015 upsample->pub.upsample = merged_1v_upsample;
1016 upsample->upmethod = h2v1_merged_upsample;
1017 /* No spare row needed */
1018 upsample->spare_row = NULL;
1021 build_ycc_rgb_table(cinfo);
1024 #endif /* UPSAMPLE_MERGING_SUPPORTED */