r999: maintainers added to README_en.
[cinelerra_cv/mob.git] / quicktime / rtjpeg_core.c
blob1b7441fe1027c58216e5a8e8f92d4e0b95f94191
1 /*
2 bttvgrab 0.15.4 [1999-03-23]
3 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
5 Maintained by: Joerg Walter
6 Current version at http://moes.pmnet.uni-oldenburg.de/bttvgrab/
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 This file is a modified version of RTjpeg 0.1.2, (C) Justin Schoeman 1998
28 Main Routines
30 This file contains most of the initialisation and control functions
32 (C) Justin Schoeman 1998
36 #include <sys/types.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include "rtjpeg_core.h"
/*
 * Scan order used by RTjpeg_b2s() and RTjpeg_s2b(): entry n is the index
 * into the 8x8 block (stored as 64 consecutive values) of the n-th
 * coefficient placed in / taken from the stream.  The block is walked
 * along its anti-diagonals, one diagonal per source line below.
 * Every index 0..63 appears exactly once.
 */
static const unsigned char RTjpeg_ZZ[64]={
0,
8, 1,
2, 9, 16,
24, 17, 10, 3,
4, 11, 18, 25, 32,
40, 33, 26, 19, 12, 5,
6, 13, 20, 27, 34, 41, 48,
56, 49, 42, 35, 28, 21, 14, 7,
15, 22, 29, 36, 43, 50, 57,
58, 51, 44, 37, 30, 23,
31, 38, 45, 52, 59,
60, 53, 46, 39,
47, 54, 61,
62, 55,
63 };
59 static const __u64 RTjpeg_aan_tab[64]={
60 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
61 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
62 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
63 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
64 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
65 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
66 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
67 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
/*
 * Base quantiser steps for the luminance plane, one per coefficient of
 * an 8x8 block, listed row by row.
 */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
    16, 11, 10, 16, 24, 40, 51, 61,
    12, 12, 14, 19, 26, 58, 60, 55,
    14, 13, 16, 24, 40, 57, 69, 56,
    14, 17, 22, 29, 51, 87, 80, 62,
    18, 22, 37, 56, 68, 109, 103, 77,
    24, 35, 55, 64, 81, 104, 113, 92,
    49, 64, 78, 87, 103, 121, 120, 101,
    72, 92, 95, 98, 112, 100, 103, 99
};
/*
 * Base quantiser steps for the chrominance planes, one per coefficient
 * of an 8x8 block, listed row by row.
 */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};
92 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
94 register int ci, co=1, tmp;
95 register __s16 ZZvalue;
97 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
99 for(ci=1; ci<=bt8; ci++)
101 ZZvalue = data[RTjpeg_ZZ[ci]];
103 if(ZZvalue>0)
105 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
107 else
109 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
113 for(; ci<64; ci++)
115 ZZvalue = data[RTjpeg_ZZ[ci]];
117 if(ZZvalue>0)
119 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue;
121 else if(ZZvalue<0)
123 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue;
125 else /* compress zeros */
127 tmp=ci;
130 ci++;
132 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
134 strm[co++]=(__s8)(63+(ci-tmp));
135 ci--;
138 return (int)co;
141 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
143 int ci=1, co=1, tmp;
144 register int i;
146 i=RTjpeg_ZZ[0];
147 data[i]=((__u8)strm[0])*qtbl[i];
149 for(co=1; co<=bt8; co++)
151 i=RTjpeg_ZZ[co];
152 data[i]=strm[ci++]*qtbl[i];
155 for(; co<64; co++)
157 if(strm[ci]>63)
159 tmp=co+strm[ci]-63;
160 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
161 co--;
162 } else
164 i=RTjpeg_ZZ[co];
165 data[i]=strm[ci]*qtbl[i];
167 ci++;
169 return (int)ci;
172 #if defined(USE_MMX)
173 void RTjpeg_quant_init(void)
175 int i;
176 __s16 *qtbl;
178 qtbl=(__s16 *)RTjpeg_lqt;
179 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];
181 qtbl=(__s16 *)RTjpeg_cqt;
182 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
185 static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
186 static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;
188 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
190 int i;
191 mmx_t *bl, *ql;
193 ql=(mmx_t *)qtbl;
194 bl=(mmx_t *)block;
196 movq_m2r(RTjpeg_ones, mm6);
197 movq_m2r(RTjpeg_half, mm7);
199 for(i=16; i; i--)
201 movq_m2r(*(ql++), mm0); /* quant vals (4) */
202 movq_m2r(*bl, mm2); /* block vals (4) */
203 movq_r2r(mm0, mm1);
204 movq_r2r(mm2, mm3);
206 punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
207 punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
209 punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
210 punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
212 pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
213 pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
215 psrad_i2r(16, mm0);
216 psrad_i2r(16, mm1);
218 packssdw_r2r(mm1, mm0);
220 movq_r2m(mm0, *(bl++));
224 #else
225 void RTjpeg_quant_init(void)
229 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
231 int i;
233 for(i=0; i<64; i++)
234 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
236 #endif
239 * Perform the forward DCT on one block of samples.
/*
 * NOTE(review): this group is selected by USE_MMX, but RTjpeg_dctY
 * chooses its implementation with `#ifndef MMX` - confirm that both
 * symbols are always defined together.
 */
#ifdef USE_MMX
/* Each constant holds one 16-bit multiplier repeated four times; the
   multipliers are the same four factors as the FIX_* macros of the
   portable build, scaled by 2^14 (0x2D41 = 11585 = 0.7071 * 16384). */
static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL;
static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL;
static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;

#else

/* Multipliers with 8 fractional bits: FIX(x) = x * 256, rounded. */
#define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */
#define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */
#define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */
#define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */

/* Drop 8 (DESCALE10) or 16 (DESCALE20) fractional bits, adding half of
   the divisor first so the result is rounded, then narrow to __s16. */
#define DESCALE10(x) (__s16)( ((x)+128) >> 8)
#define DESCALE20(x) (__s16)(((x)+32768) >> 16)
/* Product of a value and a FIX_* constant as __s32. */
#define D_MULTIPLY(var,c) ((__s32) ((var) * (c)))
#endif
260 void RTjpeg_dct_init(void)
262 int i;
264 for(i=0; i<64; i++)
266 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
267 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
271 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
273 #ifndef MMX
274 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
275 __s32 tmp10, tmp11, tmp12, tmp13;
276 __s32 z1, z2, z3, z4, z5, z11, z13;
277 __u8 *idataptr;
278 __s16 *odataptr;
279 __s32 *wsptr;
280 int ctr;
282 idataptr = idata;
283 wsptr = RTjpeg_ws;
284 for (ctr = 7; ctr >= 0; ctr--) {
285 tmp0 = idataptr[0] + idataptr[7];
286 tmp7 = idataptr[0] - idataptr[7];
287 tmp1 = idataptr[1] + idataptr[6];
288 tmp6 = idataptr[1] - idataptr[6];
289 tmp2 = idataptr[2] + idataptr[5];
290 tmp5 = idataptr[2] - idataptr[5];
291 tmp3 = idataptr[3] + idataptr[4];
292 tmp4 = idataptr[3] - idataptr[4];
294 tmp10 = (tmp0 + tmp3); /* phase 2 */
295 tmp13 = tmp0 - tmp3;
296 tmp11 = (tmp1 + tmp2);
297 tmp12 = tmp1 - tmp2;
299 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
300 wsptr[4] = (tmp10 - tmp11)<<8;
302 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
303 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
304 wsptr[6] = (tmp13<<8) - z1;
306 tmp10 = tmp4 + tmp5; /* phase 2 */
307 tmp11 = tmp5 + tmp6;
308 tmp12 = tmp6 + tmp7;
310 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
311 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
312 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
313 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
315 z11 = (tmp7<<8) + z3; /* phase 5 */
316 z13 = (tmp7<<8) - z3;
318 wsptr[5] = z13 + z2; /* phase 6 */
319 wsptr[3] = z13 - z2;
320 wsptr[1] = z11 + z4;
321 wsptr[7] = z11 - z4;
323 idataptr += rskip<<3; /* advance pointer to next row */
324 wsptr += 8;
327 wsptr = RTjpeg_ws;
328 odataptr=odata;
329 for (ctr = 7; ctr >= 0; ctr--) {
330 tmp0 = wsptr[0] + wsptr[56];
331 tmp7 = wsptr[0] - wsptr[56];
332 tmp1 = wsptr[8] + wsptr[48];
333 tmp6 = wsptr[8] - wsptr[48];
334 tmp2 = wsptr[16] + wsptr[40];
335 tmp5 = wsptr[16] - wsptr[40];
336 tmp3 = wsptr[24] + wsptr[32];
337 tmp4 = wsptr[24] - wsptr[32];
339 tmp10 = tmp0 + tmp3; /* phase 2 */
340 tmp13 = tmp0 - tmp3;
341 tmp11 = tmp1 + tmp2;
342 tmp12 = tmp1 - tmp2;
344 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
345 odataptr[32] = DESCALE10(tmp10 - tmp11);
347 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
348 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
349 odataptr[48] = DESCALE20((tmp13<<8) - z1);
351 tmp10 = tmp4 + tmp5; /* phase 2 */
352 tmp11 = tmp5 + tmp6;
353 tmp12 = tmp6 + tmp7;
355 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
356 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
357 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
358 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
360 z11 = (tmp7<<8) + z3; /* phase 5 */
361 z13 = (tmp7<<8) - z3;
363 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
364 odataptr[24] = DESCALE20(z13 - z2);
365 odataptr[8] = DESCALE20(z11 + z4);
366 odataptr[56] = DESCALE20(z11 - z4);
368 odataptr++; /* advance pointer to next column */
369 wsptr++;
371 #else
372 mmx_t tmp6, tmp7;
373 register mmx_t *dataptr = (mmx_t *)odata;
374 mmx_t *idata2 = (mmx_t *)idata;
376 /* first copy the input 8 bit to the destination 16 bits */
378 movq_m2r(RTjpeg_zero, mm2);
381 movq_m2r(*idata2, mm0);
382 movq_r2r(mm0, mm1);
384 punpcklbw_r2r(mm2, mm0);
385 movq_r2m(mm0, *(dataptr));
387 punpckhbw_r2r(mm2, mm1);
388 movq_r2m(mm1, *(dataptr+1));
390 idata2 += rskip;
392 movq_m2r(*idata2, mm0);
393 movq_r2r(mm0, mm1);
395 punpcklbw_r2r(mm2, mm0);
396 movq_r2m(mm0, *(dataptr+2));
398 punpckhbw_r2r(mm2, mm1);
399 movq_r2m(mm1, *(dataptr+3));
401 idata2 += rskip;
403 movq_m2r(*idata2, mm0);
404 movq_r2r(mm0, mm1);
406 punpcklbw_r2r(mm2, mm0);
407 movq_r2m(mm0, *(dataptr+4));
409 punpckhbw_r2r(mm2, mm1);
410 movq_r2m(mm1, *(dataptr+5));
412 idata2 += rskip;
414 movq_m2r(*idata2, mm0);
415 movq_r2r(mm0, mm1);
417 punpcklbw_r2r(mm2, mm0);
418 movq_r2m(mm0, *(dataptr+6));
420 punpckhbw_r2r(mm2, mm1);
421 movq_r2m(mm1, *(dataptr+7));
423 idata2 += rskip;
425 movq_m2r(*idata2, mm0);
426 movq_r2r(mm0, mm1);
428 punpcklbw_r2r(mm2, mm0);
429 movq_r2m(mm0, *(dataptr+8));
431 punpckhbw_r2r(mm2, mm1);
432 movq_r2m(mm1, *(dataptr+9));
434 idata2 += rskip;
436 movq_m2r(*idata2, mm0);
437 movq_r2r(mm0, mm1);
439 punpcklbw_r2r(mm2, mm0);
440 movq_r2m(mm0, *(dataptr+10));
442 punpckhbw_r2r(mm2, mm1);
443 movq_r2m(mm1, *(dataptr+11));
445 idata2 += rskip;
447 movq_m2r(*idata2, mm0);
448 movq_r2r(mm0, mm1);
450 punpcklbw_r2r(mm2, mm0);
451 movq_r2m(mm0, *(dataptr+12));
453 punpckhbw_r2r(mm2, mm1);
454 movq_r2m(mm1, *(dataptr+13));
456 idata2 += rskip;
458 movq_m2r(*idata2, mm0);
459 movq_r2r(mm0, mm1);
461 punpcklbw_r2r(mm2, mm0);
462 movq_r2m(mm0, *(dataptr+14));
464 punpckhbw_r2r(mm2, mm1);
465 movq_r2m(mm1, *(dataptr+15));
467 /* Start Transpose to do calculations on rows */
469 movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */
471 movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
472 movq_r2r(mm7, mm5);
474 punpcklwd_m2r(*(dataptr+11), mm7); /* m11:m01|m10:m00 - interleave first and second lines */
475 movq_r2r(mm6, mm2);
477 punpcklwd_m2r(*(dataptr+15), mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */
478 movq_r2r(mm7, mm1);
480 movq_m2r(*(dataptr+11), mm3); /* m13:m13|m11:m10 - second line */
481 punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */
483 movq_m2r(*(dataptr+15), mm0); /* m13:m13|m11:m10 - fourth line */
484 punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */
486 movq_r2m(mm7,*(dataptr+9)); /* write result 1 */
487 punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */
489 movq_r2m(mm1,*(dataptr+11)); /* write result 2 */
490 punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */
492 movq_r2r(mm5, mm1);
493 punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */
495 movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */
496 punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */
498 movq_r2m(mm5,*(dataptr+13)); /* write result 3 */
500 /* last 4x4 done */
502 movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */
504 movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */
505 movq_r2r(mm0, mm6);
507 punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */
508 movq_r2r(mm2, mm7);
510 punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */
511 movq_r2r(mm0, mm4);
513 /* */
514 movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */
515 punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */
517 movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */
518 punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */
520 punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */
521 movq_r2r(mm1, mm2); /* copy first line */
523 punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */
524 movq_r2r(mm6, mm5); /* copy first intermediate result */
526 movq_r2m(mm0, *(dataptr+8)); /* write result 1 */
527 punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */
529 punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */
530 movq_r2r(mm3, mm0); /* copy third line */
532 punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */
534 movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */
535 punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */
537 punpcklwd_m2r(*(dataptr+14), mm3); /* n31:n21|n30:n20 - interleave third and fourth lines */
538 movq_r2r(mm1, mm4);
540 movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */
541 punpckldq_r2r(mm3, mm1); /* n30:n20|n10:n00 - produce first result */
543 punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */
544 movq_r2r(mm2, mm6);
546 movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */
547 punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */
549 movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */
550 punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */
552 movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */
553 punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */
555 movq_r2m(mm2, *(dataptr+5)); /* write result 7 out */
557 movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */
559 movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */
562 /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */
564 movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */
565 movq_r2r(mm0, mm2);
567 punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */
568 movq_r2r(mm7, mm4);
570 punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */
571 movq_r2r(mm0, mm1);
573 movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */
574 punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */
576 movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */
577 punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */
579 movq_r2r(mm0, mm7); /* write result 1 */
580 punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */
582 psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7 /* Stage 1 */ */
583 movq_r2r(mm1, mm6); /* write result 2 */
585 paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7 /* Stage 1 */ */
586 punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */
588 paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6 /* Stage 1 */ */
589 movq_r2r(mm2, mm3); /* copy first intermediate result */
591 psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6 /* Stage 1 */ */
592 punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */
594 movq_r2m(mm7, tmp7);
595 movq_r2r(mm2, mm5); /* write result 3 */
597 movq_r2m(mm6, tmp6);
598 punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */
600 paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+5 /* Stage 1 */ */
601 movq_r2r(mm3, mm4); /* write result 4 */
603 /************************************************************************************************
604 End of Transpose
605 ************************************************************************************************/
608 paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4 /* stage 1*/ */
609 movq_r2r(mm0, mm7);
611 psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4 /* stage 1*/ */
612 movq_r2r(mm1, mm6);
614 paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 /* even 2 */ */
615 psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 /* even 2 */ */
617 psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 /* even 2 */ */
618 paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 /* even 2 */ */
620 psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5 /* stage 1*/ */
621 paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */
623 /* stage 3 */
625 movq_m2r(tmp6, mm2);
626 movq_r2r(mm0, mm3);
628 psllw_i2r(2, mm6); /* m8 * 2^2 */
629 paddw_r2r(mm1, mm0);
631 pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */
632 psubw_r2r(mm1, mm3);
634 movq_r2m(mm0, *dataptr);
635 movq_r2r(mm7, mm0);
637 /* Odd part */
638 movq_r2m(mm3, *(dataptr+8));
639 paddw_r2r(mm5, mm4); /* tmp10 */
641 movq_m2r(tmp7, mm3);
642 paddw_r2r(mm6, mm0); /* tmp32 */
644 paddw_r2r(mm2, mm5); /* tmp11 */
645 psubw_r2r(mm6, mm7); /* tmp33 */
647 movq_r2m(mm0, *(dataptr+4));
648 paddw_r2r(mm3, mm2); /* tmp12 */
650 /* stage 4 */
652 movq_r2m(mm7, *(dataptr+12));
653 movq_r2r(mm4, mm1); /* copy of tmp10 */
655 psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */
656 psllw_i2r(2, mm4); /* m8 * 2^2 */
658 movq_m2r(RTjpeg_C2mC6, mm0);
659 psllw_i2r(2, mm1);
661 pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */
662 psllw_i2r(2, mm2);
664 pmulhw_r2r(mm0, mm4); /* z5 */
666 /* stage 5 */
668 pmulhw_m2r(RTjpeg_C2pC6, mm2);
669 psllw_i2r(2, mm5);
671 pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */
672 movq_r2r(mm3, mm0); /* copy tmp7 */
674 movq_m2r(*(dataptr+1), mm7);
675 paddw_r2r(mm1, mm4); /* z2 */
677 paddw_r2r(mm1, mm2); /* z4 */
679 paddw_r2r(mm5, mm0); /* z11 */
680 psubw_r2r(mm5, mm3); /* z13 */
682 /* stage 6 */
684 movq_r2r(mm3, mm5); /* copy z13 */
685 psubw_r2r(mm4, mm3); /* y3=z13 - z2 */
687 paddw_r2r(mm4, mm5); /* y5=z13 + z2 */
688 movq_r2r(mm0, mm6); /* copy z11 */
690 movq_r2m(mm3, *(dataptr+6)); /*save y3 */
691 psubw_r2r(mm2, mm0); /* y7=z11 - z4 */
693 movq_r2m(mm5, *(dataptr+10)); /*save y5 */
694 paddw_r2r(mm2, mm6); /* y1=z11 + z4 */
696 movq_r2m(mm0, *(dataptr+14)); /*save y7 */
698 /************************************************
699 * End of 1st 4 rows
700 ************************************************/
702 movq_m2r(*(dataptr+3), mm1); /* load x1 /* stage 1 */ */
703 movq_r2r(mm7, mm0); /* copy x0 */
705 movq_r2m(mm6, *(dataptr+2)); /*save y1 */
707 movq_m2r(*(dataptr+5), mm2); /* load x2 /* stage 1 */ */
708 movq_r2r(mm1, mm6); /* copy x1 */
710 paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */
712 movq_m2r(*(dataptr+7), mm3); /* load x3 /* stage 1 */ */
713 movq_r2r(mm2, mm5); /* copy x2 */
715 psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */
716 movq_r2r(mm3, mm4); /* copy x3 */
718 paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */
720 movq_r2m(mm7, tmp7); /* save tmp07 */
721 movq_r2r(mm0, mm7); /* copy tmp00 */
723 psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */
725 /* stage 2, Even Part */
727 paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */
729 movq_r2m(mm6, tmp6); /* save tmp07 */
730 movq_r2r(mm1, mm6); /* copy tmp01 */
732 paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */
733 paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */
735 psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */
737 psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */
738 psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */
740 paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */
742 psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */
743 paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */
745 /* stage 3, Even and stage 4 & 5 even */
747 movq_m2r(tmp6, mm2); /* load tmp6 */
748 movq_r2r(mm0, mm3); /* copy tmp10 */
750 psllw_i2r(2, mm6); /* shift z1 */
751 paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */
753 pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */
754 psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */
756 movq_r2m(mm0, *(dataptr+1)); /*save y0 */
757 movq_r2r(mm7, mm0); /* copy tmp13 */
759 /* odd part */
761 movq_r2m(mm3, *(dataptr+9)); /*save y4 */
762 paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */
764 movq_m2r(tmp7, mm3); /* load tmp7 */
765 paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */
767 paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */
768 psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */
770 movq_r2m(mm0, *(dataptr+5)); /*save y2 */
771 paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */
773 /* stage 4 */
775 movq_r2m(mm7, *(dataptr+13)); /*save y6 */
776 movq_r2r(mm4, mm1); /* copy tmp10 */
778 psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */
779 psllw_i2r(2, mm4); /* shift tmp10 */
781 movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */
782 psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */
784 pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */
785 psllw_i2r(2, mm5); /* prepare for multiply */
787 pmulhw_r2r(mm0, mm4); /* multiply by converted real */
789 /* stage 5 */
791 pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */
792 psllw_i2r(2, mm2); /* prepare for multiply */
794 pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */
795 movq_r2r(mm3, mm0); /* copy tmp7 */
797 movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
798 paddw_r2r(mm1, mm4); /* z2 */
800 paddw_r2r(mm5, mm0); /* z11 */
801 psubw_r2r(mm5, mm3); /* z13 */
803 /* stage 6 */
805 movq_r2r(mm3, mm5); /* copy z13 */
806 paddw_r2r(mm1, mm2); /* z4 */
808 movq_r2r(mm0, mm6); /* copy z11 */
809 psubw_r2r(mm4, mm5); /* y3 */
811 paddw_r2r(mm2, mm6); /* y1 */
812 paddw_r2r(mm4, mm3); /* y5 */
814 movq_r2m(mm5, *(dataptr+7)); /*save y3 */
816 movq_r2m(mm6, *(dataptr+3)); /*save y1 */
817 psubw_r2r(mm2, mm0); /* y7 */
819 /************************************************************************************************
820 Start of Transpose
821 ************************************************************************************************/
823 movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
824 movq_r2r(mm7, mm5); /* copy first line */
826 punpcklwd_r2r(mm3, mm7); /* m11:m01|m10:m00 - interleave first and second lines */
827 movq_r2r(mm6, mm2); /* copy third line */
829 punpcklwd_r2r(mm0, mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */
830 movq_r2r(mm7, mm1); /* copy first intermediate result */
832 punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */
834 punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */
836 movq_r2m(mm7, *(dataptr+9)); /* write result 1 */
837 punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */
839 movq_r2m(mm1, *(dataptr+11)); /* write result 2 */
840 punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */
842 movq_r2r(mm5, mm1); /* copy first intermediate result */
843 punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */
845 movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */
846 punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */
848 movq_r2m(mm5, *(dataptr+13)); /* write result 3 */
850 /****** last 4x4 done */
852 movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */
854 movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */
855 movq_r2r(mm0, mm6); /* copy first line */
857 punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */
858 movq_r2r(mm2, mm7); /* copy third line */
860 punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */
861 movq_r2r(mm0, mm4); /* copy first intermediate result */
865 movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */
866 punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */
868 movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */
869 punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */
871 punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */
872 movq_r2r(mm1, mm2); /* copy first line */
874 punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */
875 movq_r2r(mm6, mm5); /* copy first intermediate result */
877 movq_r2m(mm0, *(dataptr+8)); /* write result 1 */
878 punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */
880 punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */
881 movq_r2r(mm3, mm0); /* copy third line */
883 punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */
885 movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */
886 punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */
888 punpcklwd_m2r(*(dataptr+14), mm3); /* n33:n23|n32:n22 - interleave third and fourth lines */
889 movq_r2r(mm1, mm4); /* copy second intermediate result */
891 movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */
892 punpckldq_r2r(mm3, mm1); /* */
894 punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */
895 movq_r2r(mm2, mm6); /* copy second intermediate result */
897 movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */
898 punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */
900 movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */
901 punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */
903 movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */
904 punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */
906 movq_r2m(mm2, *(dataptr+5)); /* write result 7 out */
908 movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */
910 movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */
912 /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */
914 movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */
915 movq_r2r(mm0, mm2); /* copy first line */
917 punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */
918 movq_r2r(mm7, mm4); /* copy third line */
920 punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */
921 movq_r2r(mm0, mm1); /* copy first intermediate result */
923 movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */
924 punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */
926 movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */
927 punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */
929 movq_r2r(mm0, mm7); /* write result 1 */
930 punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */
932 psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7 /* Stage 1 */ */
933 movq_r2r(mm1, mm6); /* write result 2 */
935 paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7 /* Stage 1 */ */
936 punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */
938 paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6 /* Stage 1 */ */
939 movq_r2r(mm2, mm3); /* copy first intermediate result */
941 psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6 /* Stage 1 */ */
942 punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */
944 movq_r2m(mm7, tmp7); /* save tmp07 */
945 movq_r2r(mm2, mm5); /* write result 3 */
947 movq_r2m(mm6, tmp6); /* save tmp06 */
949 punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */
951 paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+x5 /* stage 1 */ */
952 movq_r2r(mm3, mm4); /* write result 4 */
954 /************************************************************************************************
955 End of Transpose 2
956 ************************************************************************************************/
958 paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4 /* stage 1*/ */
959 movq_r2r(mm0, mm7);
961 psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4 /* stage 1*/ */
962 movq_r2r(mm1, mm6);
964 paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 /* even 2 */ */
965 psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 /* even 2 */ */
967 psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 /* even 2 */ */
968 paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 /* even 2 */ */
970 psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5 /* stage 1*/ */
971 paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */
973 /* stage 3 */
975 movq_m2r(tmp6, mm2);
976 movq_r2r(mm0, mm3);
978 psllw_i2r(2, mm6); /* m8 * 2^2 */
979 paddw_r2r(mm1, mm0);
981 pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */
982 psubw_r2r(mm1, mm3);
984 movq_r2m(mm0, *dataptr);
985 movq_r2r(mm7, mm0);
987 /* Odd part */
988 movq_r2m(mm3, *(dataptr+8));
989 paddw_r2r(mm5, mm4); /* tmp10 */
991 movq_m2r(tmp7, mm3);
992 paddw_r2r(mm6, mm0); /* tmp32 */
994 paddw_r2r(mm2, mm5); /* tmp11 */
995 psubw_r2r(mm6, mm7); /* tmp33 */
997 movq_r2m(mm0, *(dataptr+4));
998 paddw_r2r(mm3, mm2); /* tmp12 */
1000 /* stage 4 */
1001 movq_r2m(mm7, *(dataptr+12));
1002 movq_r2r(mm4, mm1); /* copy of tmp10 */
1004 psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */
1005 psllw_i2r(2, mm4); /* m8 * 2^2 */
1007 movq_m2r(RTjpeg_C2mC6, mm0);
1008 psllw_i2r(2, mm1);
1010 pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */
1011 psllw_i2r(2, mm2);
1013 pmulhw_r2r(mm0, mm4); /* z5 */
1015 /* stage 5 */
1017 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1018 psllw_i2r(2, mm5);
1020 pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */
1021 movq_r2r(mm3, mm0); /* copy tmp7 */
1023 movq_m2r(*(dataptr+1), mm7);
1024 paddw_r2r(mm1, mm4); /* z2 */
1026 paddw_r2r(mm1, mm2); /* z4 */
1028 paddw_r2r(mm5, mm0); /* z11 */
1029 psubw_r2r(mm5, mm3); /* z13 */
1031 /* stage 6 */
1033 movq_r2r(mm3, mm5); /* copy z13 */
1034 psubw_r2r(mm4, mm3); /* y3=z13 - z2 */
1036 paddw_r2r(mm4, mm5); /* y5=z13 + z2 */
1037 movq_r2r(mm0, mm6); /* copy z11 */
1039 movq_r2m(mm3, *(dataptr+6)); /*save y3 */
1040 psubw_r2r(mm2, mm0); /* y7=z11 - z4 */
1042 movq_r2m(mm5, *(dataptr+10)); /*save y5 */
1043 paddw_r2r(mm2, mm6); /* y1=z11 + z4 */
1045 movq_r2m(mm0, *(dataptr+14)); /*save y7 */
1047 /************************************************
1048 * End of 1st 4 rows
1049 ************************************************/
1051 movq_m2r(*(dataptr+3), mm1); /* load x1 /* stage 1 */ */
1052 movq_r2r(mm7, mm0); /* copy x0 */
1054 movq_r2m(mm6, *(dataptr+2)); /*save y1 */
1056 movq_m2r(*(dataptr+5), mm2); /* load x2 /* stage 1 */ */
1057 movq_r2r(mm1, mm6); /* copy x1 */
1059 paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */
1061 movq_m2r(*(dataptr+7), mm3); /* load x3 /* stage 1 */ */
1062 movq_r2r(mm2, mm5); /* copy x2 */
1064 psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */
1065 movq_r2r(mm3, mm4); /* copy x3 */
1067 paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */
1069 movq_r2m(mm7, tmp7); /* save tmp07 */
1070 movq_r2r(mm0, mm7); /* copy tmp00 */
1072 psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */
1074 /* stage 2, Even Part */
1076 paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */
1078 movq_r2m(mm6, tmp6); /* save tmp07 */
1079 movq_r2r(mm1, mm6); /* copy tmp01 */
1081 paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */
1082 paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */
1084 psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */
1086 psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */
1087 psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */
1089 paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */
1091 psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */
1092 paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */
1094 /* stage 3, Even and stage 4 & 5 even */
1096 movq_m2r(tmp6, mm2); /* load tmp6 */
1097 movq_r2r(mm0, mm3); /* copy tmp10 */
1099 psllw_i2r(2, mm6); /* shift z1 */
1100 paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */
1102 pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */
1103 psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */
1105 movq_r2m(mm0, *(dataptr+1)); /*save y0 */
1106 movq_r2r(mm7, mm0); /* copy tmp13 */
1108 /* odd part */
1110 movq_r2m(mm3, *(dataptr+9)); /*save y4 */
1111 paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */
1113 movq_m2r(tmp7, mm3); /* load tmp7 */
1114 paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */
1116 paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */
1117 psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */
1119 movq_r2m(mm0, *(dataptr+5)); /*save y2 */
1120 paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */
1122 /* stage 4 */
1124 movq_r2m(mm7, *(dataptr+13)); /*save y6 */
1125 movq_r2r(mm4, mm1); /* copy tmp10 */
1127 psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */
1128 psllw_i2r(2, mm4); /* shift tmp10 */
1130 movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */
1131 psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */
1133 pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */
1134 psllw_i2r(2, mm5); /* prepare for multiply */
1136 pmulhw_r2r(mm0, mm4); /* multiply by converted real */
1138 /* stage 5 */
1140 pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */
1141 psllw_i2r(2, mm2); /* prepare for multiply */
1143 pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */
1144 movq_r2r(mm3, mm0); /* copy tmp7 */
1146 movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
1147 paddw_r2r(mm1, mm4); /* z2 */
1149 paddw_r2r(mm5, mm0); /* z11 */
1150 psubw_r2r(mm5, mm3); /* z13 */
1152 /* stage 6 */
1154 movq_r2r(mm3, mm5); /* copy z13 */
1155 paddw_r2r(mm1, mm2); /* z4 */
1157 movq_r2r(mm0, mm6); /* copy z11 */
1158 psubw_r2r(mm4, mm5); /* y3 */
1160 paddw_r2r(mm2, mm6); /* y1 */
1161 paddw_r2r(mm4, mm3); /* y5 */
1163 movq_r2m(mm5, *(dataptr+7)); /*save y3 */
1164 psubw_r2r(mm2, mm0); /* yè=z11 - z4 */
1166 movq_r2m(mm3, *(dataptr+11)); /*save y5 */
1168 movq_r2m(mm6, *(dataptr+3)); /*save y1 */
1170 movq_r2m(mm0, *(dataptr+15)); /*save y7 */
1173 #endif
/*
 * Fixed-point constants used by the integer inverse DCT.  Each value is the
 * real number in its name multiplied by 256 (8 fractional bits) and rounded,
 * which matches the ">> 8" rescale performed by MULTIPLY() below.
 */
#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */

/*
 * Divide by 8 with rounding (add 4, then shift right by 3) and narrow the
 * result to __s16.  The expansion is fully parenthesised so it can be used
 * as an operand of any surrounding expression.
 */
#define DESCALE(x) ((__s16)(((x) + 4) >> 3))

/*
 * Clip a sample to the range 16..235.  (For the cr/cb planes the range
 * should arguably be 16..240, but the same limits are used for everything.)
 *
 * The whole conditional is wrapped in parentheses: without them an
 * expression such as "2 * RL(v)" would bind the multiplication to the
 * comparison instead of to the clipped result.
 *
 * NOTE: x is evaluated more than once; do not pass an argument with side
 * effects.
 */
#define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x)))

/*
 * Fixed-point multiply: form var * coef, add 128 for rounding and drop the
 * 8 fractional bits carried by the FIX_* constants.
 */
#define MULTIPLY(var, coef) (((__s32) ((var) * (coef)) + 128) >> 8)
1188 void RTjpeg_idct_init(void)
1190 int i;
1192 for(i=0; i<64; i++)
1194 RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
1195 RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
1199 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
1201 #ifdef USE_MMX
1203 static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL;
1204 static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1205 static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL;
1206 static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL;
1207 static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1209 mmx_t workspace[64];
1210 mmx_t *wsptr = workspace;
1211 register mmx_t *dataptr = (mmx_t *)odata;
1212 mmx_t *idata = (mmx_t *)data;
1214 rskip = rskip>>3;
1216 * Perform inverse DCT on one block of coefficients.
1219 /* Odd part */
1221 movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */
1223 movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */
1225 movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */
1227 movq_r2r(mm1, mm2); /* copy tmp6 /* phase 6 */ */
1229 movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */
1231 paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */
1233 psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */
1235 psllw_i2r(2, mm2); /* shift z10 */
1236 movq_r2r(mm2, mm0); /* copy z10 */
1238 pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ */
1239 movq_r2r(mm3, mm5); /* copy tmp4 */
1241 pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ */
1242 paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */
1244 movq_r2r(mm3, mm6); /* copy z11 /* phase 5 */ */
1245 psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */
1247 psubw_r2r(mm1, mm6); /* z11-z13 */
1248 psllw_i2r(2, mm5); /* shift z12 */
1250 movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */
1251 movq_r2r(mm5, mm7); /* copy z12 */
1253 pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part */
1254 paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */
1256 /*ok */
1258 /* Even part */
1259 pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ */
1260 psllw_i2r(2, mm6);
1262 movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */
1264 paddw_r2r(mm5, mm0); /* tmp10 */
1266 paddw_r2r(mm7, mm2); /* tmp12 */
1268 pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ */
1269 psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */
1271 movq_r2r(mm1, mm5); /* copy tmp1 */
1272 paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; /* phases 5-3 */ */
1274 psubw_r2r(mm4, mm5); /* tmp1-tmp3 */
1275 psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */
1277 movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */
1278 psllw_i2r(2, mm5); /* shift tmp1-tmp3 */
1280 movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */
1282 pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
1283 paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */
1285 movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */
1287 psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ */
1289 movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */
1290 movq_r2r(mm7, mm1); /* copy tmp0 /* phase 3 */ */
1292 movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */
1293 psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */
1295 paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */
1296 movq_r2r(mm1, mm5); /* copy tmp11 */
1298 paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */
1299 movq_r2r(mm7, mm4); /* copy tmp10 /* phase 2 */ */
1301 paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */
1303 psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */
1304 movq_r2r(mm7, mm0); /* copy tmp0 */
1306 psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */
1307 paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
1309 psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */
1311 movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */
1312 movq_r2r(mm1, mm3); /* copy tmp1 */
1314 movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */
1315 paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */
1317 psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */
1319 movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */
1320 movq_r2r(mm4, mm1); /* copy tmp3 */
1322 movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */
1324 paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */
1326 psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */
1328 movq_r2m(mm4, *(wsptr+8));
1329 movq_r2r(mm5, mm7); /* copy tmp2 */
1331 paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */
1333 movq_r2m(mm1, *(wsptr+6));
1334 psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */
1336 movq_r2m(mm5, *(wsptr+4));
1338 movq_r2m(mm7, *(wsptr+10));
1340 /*ok */
1343 /*****************************************************************/
1345 idata++;
1346 wsptr++;
1348 /*****************************************************************/
1350 movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */
1352 movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */
1354 movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */
1355 movq_r2r(mm1, mm2); /* copy tmp6 /* phase 6 */ */
1357 movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */
1358 paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */
1360 psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */
1362 psllw_i2r(2, mm2); /* shift z10 */
1363 movq_r2r(mm2, mm0); /* copy z10 */
1365 pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ */
1366 movq_r2r(mm3, mm5); /* copy tmp4 */
1368 pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ */
1369 paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */
1371 movq_r2r(mm3, mm6); /* copy z11 /* phase 5 */ */
1372 psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */
1374 psubw_r2r(mm1, mm6); /* z11-z13 */
1375 psllw_i2r(2, mm5); /* shift z12 */
1377 movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */
1378 movq_r2r(mm5, mm7); /* copy z12 */
1380 pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part */
1381 paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */
1383 /*ok */
1385 /* Even part */
1386 pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ */
1387 psllw_i2r(2, mm6);
1389 movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */
1391 paddw_r2r(mm5, mm0); /* tmp10 */
1393 paddw_r2r(mm7, mm2); /* tmp12 */
1395 pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ */
1396 psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */
1398 movq_r2r(mm1, mm5); /* copy tmp1 */
1399 paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; /* phases 5-3 */ */
1401 psubw_r2r(mm4, mm5); /* tmp1-tmp3 */
1402 psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */
1404 movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */
1405 psllw_i2r(2, mm5); /* shift tmp1-tmp3 */
1407 movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */
1408 paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */
1410 pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
1412 movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */
1414 psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ */
1416 movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */
1417 movq_r2r(mm7, mm1); /* copy tmp0 /* phase 3 */ */
1419 movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */
1420 psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */
1422 paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */
1423 movq_r2r(mm1, mm5); /* copy tmp11 */
1425 paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */
1426 movq_r2r(mm7, mm4); /* copy tmp10 /* phase 2 */ */
1428 paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */
1430 psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */
1431 movq_r2r(mm7, mm0); /* copy tmp0 */
1433 psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */
1434 paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
1436 psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */
1438 movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */
1439 movq_r2r(mm1, mm3); /* copy tmp1 */
1441 movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */
1442 paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */
1444 psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */
1446 movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */
1447 movq_r2r(mm4, mm1); /* copy tmp3 */
1449 movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */
1451 paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */
1453 psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */
1455 movq_r2m(mm4, *(wsptr+8));
1456 movq_r2r(mm5, mm7); /* copy tmp2 */
1458 paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */
1460 movq_r2m(mm1, *(wsptr+6));
1461 psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */
1463 movq_r2m(mm5, *(wsptr+4));
1465 movq_r2m(mm7, *(wsptr+10));
1467 /*****************************************************************/
1469 /* Pass 2: process rows from work array, store into output array. */
1470 /* Note that we must descale the results by a factor of 8 == 2**3, */
1471 /* and also undo the PASS1_BITS scaling. */
1473 /*****************************************************************/
1474 /* Even part */
1476 wsptr--;
1478 /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
1479 /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
1480 /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
1481 /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
1482 movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */
1484 movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */
1485 movq_r2r(mm0, mm2);
1487 movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */
1488 paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */
1490 movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */
1491 psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */
1493 movq_r2r(mm0, mm6);
1494 movq_r2r(mm3, mm5);
1496 paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
1497 movq_r2r(mm2, mm1);
1499 psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
1500 punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */
1502 movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */
1503 punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */
1505 movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */
1506 punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
1508 punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
1509 movq_r2r(mm3, mm4);
1511 movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */
1512 punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */
1514 movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */
1515 punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
1518 paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
1519 movq_r2r(mm6, mm2);
1521 psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
1522 paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */
1524 movq_r2r(mm3, mm5);
1525 punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
1527 psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
1528 punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */
1530 movq_r2r(mm4, mm7);
1531 punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */
1533 punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */
1535 punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */
1537 punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
1538 movq_r2r(mm1, mm6);
1540 /*ok */
1542 /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
1543 /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
1546 movq_r2r(mm0, mm2);
1547 punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */
1549 punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
1550 psllw_i2r(2, mm6);
1552 pmulhw_m2r(fix_141, mm6);
1553 punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */
1555 punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
1556 movq_r2r(mm0, mm7);
1558 /* tmp0 = tmp10 + tmp13; */
1559 /* tmp3 = tmp10 - tmp13; */
1560 paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
1561 psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */
1563 /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
1564 psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
1565 /* tmp1 = tmp11 + tmp12; */
1566 /* tmp2 = tmp11 - tmp12; */
1567 movq_r2r(mm1, mm5);
1569 /*OK */
1571 /* Odd part */
1573 /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
1574 /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
1575 /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
1576 /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
1577 movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */
1578 paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */
1580 movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */
1581 psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */
1583 movq_r2r(mm3, mm6);
1584 punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */
1586 punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */
1587 movq_r2r(mm3, mm2);
1589 /*Save tmp0 and tmp1 in wsptr */
1590 movq_r2m(mm0, *(wsptr)); /* save tmp0 */
1591 paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */
1594 /*Continue with z10 --- z13 */
1595 movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */
1596 psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */
1598 movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */
1599 movq_r2r(mm6, mm4);
1601 movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */
1602 punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */
1604 punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */
1605 movq_r2r(mm6, mm1);
1607 /*Save tmp2 and tmp3 in wsptr */
1608 paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
1609 movq_r2r(mm2, mm4);
1611 /*Continue with z10 --- z13 */
1612 movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */
1613 punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */
1615 psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
1616 punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */
1618 movq_r2r(mm3, mm0);
1619 punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */
1621 movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */
1622 punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */
1624 movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */
1625 punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */
1627 movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */
1628 punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */
1630 movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */
1631 movq_r2r(mm6, mm4);
1633 punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */
1634 movq_r2r(mm1, mm5);
1636 punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */
1637 movq_r2r(mm6, mm2);
1639 movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */
1640 paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */
1642 psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
1643 punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */
1645 punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */
1646 movq_r2r(mm1, mm7);
1648 paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
1649 psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */
1651 movq_r2r(mm6, mm5);
1652 punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */
1654 punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
1655 movq_r2r(mm2, mm4);
1657 punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */
1659 punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */
1661 punpckhdq_r2r(mm6, mm4); /*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */
1663 punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
1664 movq_r2r(mm0, mm5);
1666 punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */
1668 punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
1669 movq_r2r(mm3, mm4);
1671 punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
1672 movq_r2r(mm5, mm1);
1674 punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
1675 /* tmp7 = z11 + z13; /* phase 5 */ */
1676 /* tmp8 = z11 - z13; /* phase 5 */ */
1677 psubw_r2r(mm4, mm1); /* tmp8 */
1679 paddw_r2r(mm4, mm5); /* tmp7 */
1680 /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ */
1681 psllw_i2r(2, mm1);
1683 psllw_i2r(2, mm0);
1685 pmulhw_m2r(fix_141, mm1); /* tmp21 */
1686 /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ */
1687 /* + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ */
1688 psllw_i2r(2, mm3);
1689 movq_r2r(mm0, mm7);
1691 pmulhw_m2r(fix_n184, mm7);
1692 movq_r2r(mm3, mm6);
1694 movq_m2r(*(wsptr), mm2); /* tmp0,final1 */
1696 pmulhw_m2r(fix_108n184, mm6);
1697 /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ */
1698 /* + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ */
1699 movq_r2r(mm2, mm4); /* final1 */
1701 pmulhw_m2r(fix_184n261, mm0);
1702 paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */
1704 pmulhw_m2r(fix_184, mm3);
1705 psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */
1707 /* tmp6 = tmp22 - tmp7; /* phase 2 */ */
1708 psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */
1710 paddw_r2r(mm6, mm7); /* tmp20 */
1711 psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */
1713 paddw_r2r(mm0, mm3); /* tmp22 */
1715 /* tmp5 = tmp21 - tmp6; */
1716 psubw_r2r(mm5, mm3); /* tmp6 */
1718 /* tmp4 = tmp20 + tmp5; */
1719 movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */
1720 psubw_r2r(mm3, mm1); /* tmp5 */
1722 movq_r2r(mm0, mm6); /* final2 */
1723 paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */
1725 /* Final output stage: scale down by a factor of 8 and range-limit */
1728 /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
1729 /* & RANGE_MASK]; */
1730 /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
1731 /* & RANGE_MASK]; final1 */
1734 /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
1735 /* & RANGE_MASK]; */
1736 /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
1737 /* & RANGE_MASK]; final2 */
1738 psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */
1739 psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */
1741 psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */
1743 packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
1745 movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */
1746 packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */
1748 /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
1749 /* & RANGE_MASK]; */
1750 /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
1751 /* & RANGE_MASK]; final3 */
1752 paddw_r2r(mm1, mm7); /* tmp4 */
1753 movq_r2r(mm5, mm3);
1755 paddw_r2r(mm1, mm5); /* tmp2+tmp5 */
1756 psubw_r2r(mm1, mm3); /* tmp2-tmp5 */
1758 psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */
1760 movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */
1761 psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */
1765 /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
1766 /* & RANGE_MASK]; */
1767 /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
1768 /* & RANGE_MASK]; final4 */
1769 movq_r2r(mm4, mm6);
1770 paddw_r2r(mm7, mm4); /* tmp3+tmp4 */
1772 psubw_r2r(mm7, mm6); /* tmp3-tmp4 */
1773 psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */
1775 /* mov ecx, [dataptr] */
1777 psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */
1779 packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */
1781 packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
1782 movq_r2r(mm2, mm4);
1784 movq_r2r(mm5, mm7);
1785 punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */
1787 punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
1788 movq_r2r(mm2, mm1);
1790 punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */
1792 /* add dataptr, 4 */
1794 punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */
1796 punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
1798 /* add ecx, output_col */
1800 movq_r2r(mm7, mm6);
1801 punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */
1803 movq_r2r(mm2, mm0);
1804 punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */
1806 /* mov idata, [dataptr] */
1808 punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */
1810 /* add dataptr, 4 */
1812 movq_r2r(mm1, mm3);
1814 /* add idata, output_col */
1816 punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
1818 movq_r2m(mm2, *(dataptr));
1820 punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */
1822 dataptr += rskip;
1823 movq_r2m(mm0, *(dataptr));
1825 punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
1826 punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */
1828 dataptr += rskip;
1829 movq_r2m(mm1, *(dataptr));
1831 dataptr += rskip;
1832 movq_r2m(mm3, *(dataptr));
1834 /*******************************************************************/
1836 wsptr += 8;
1838 /*******************************************************************/
1840 /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
1841 /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
1842 /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
1843 /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
1844 movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */
1846 movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */
1847 movq_r2r(mm0, mm2);
1849 movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */
1850 paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */
1852 movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */
1853 psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */
1855 movq_r2r(mm0, mm6);
1856 movq_r2r(mm3, mm5);
1858 paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
1859 movq_r2r(mm2, mm1);
1861 psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
1862 punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */
1864 movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */
1865 punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */
1867 movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */
1868 punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
1870 punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
1871 movq_r2r(mm3, mm4);
1873 movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */
1874 punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */
1876 movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */
1877 punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
1879 paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
1880 movq_r2r(mm6, mm2);
1882 psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
1883 paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */
1885 movq_r2r(mm3, mm5);
1886 punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
1888 psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
1889 punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */
1891 movq_r2r(mm4, mm7);
1892 punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */
1894 punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */
1896 punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */
1898 punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
1899 movq_r2r(mm1, mm6);
1901 /*OK */
1903 /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
1904 /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */
1906 movq_r2r(mm0, mm2);
1907 punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */
1909 punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
1910 psllw_i2r(2, mm6);
1912 pmulhw_m2r(fix_141, mm6);
1913 punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */
1915 punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
1916 movq_r2r(mm0, mm7);
1918 /* tmp0 = tmp10 + tmp13; */
1919 /* tmp3 = tmp10 - tmp13; */
1920 paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
1921 psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */
1923 /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
1924 psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
1925 /* tmp1 = tmp11 + tmp12; */
1926 /* tmp2 = tmp11 - tmp12; */
1927 movq_r2r(mm1, mm5);
1929 /*OK */
1932 /* Odd part */
1934 /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
1935 /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
1936 /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
1937 /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
1938 movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */
1939 paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */
1941 movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */
1942 psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */
1944 movq_r2r(mm3, mm6);
1945 punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */
1947 punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */
1948 movq_r2r(mm3, mm2);
1950 /*Save tmp0 and tmp1 in wsptr */
1951 movq_r2m(mm0, *(wsptr)); /* save tmp0 */
1952 paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */
1955 /*Continue with z10 --- z13 */
1956 movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */
1957 psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */
1959 movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */
1960 movq_r2r(mm6, mm4);
1962 movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */
1963 punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */
1965 punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */
1966 movq_r2r(mm6, mm1);
1968 /*Save tmp2 and tmp3 in wsptr */
1969 paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
1970 movq_r2r(mm2, mm4);
1972 /*Continue with z10 --- z13 */
1973 movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */
1974 punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */
1976 psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
1977 punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */
1979 movq_r2r(mm3, mm0);
1980 punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */
1982 movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */
1983 punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */
1985 movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */
1986 punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */
1988 movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */
1989 punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */
1991 movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */
1992 movq_r2r(mm6, mm4);
1994 punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */
1995 movq_r2r(mm1, mm5);
1997 punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */
1998 movq_r2r(mm6, mm2);
2000 movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */
2001 paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */
2003 psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
2004 punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */
2006 punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */
2007 movq_r2r(mm1, mm7);
2009 paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
2010 psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */
2012 movq_r2r(mm6, mm5);
2013 punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */
2015 punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
2016 movq_r2r(mm2, mm4);
2018 punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */
2020 punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */
2022 punpckhdq_r2r(mm6, mm4); /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */
2024 punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
2025 movq_r2r(mm0, mm5);
2027 punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */
2029 punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
2030 movq_r2r(mm3, mm4);
2032 punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
2033 movq_r2r(mm5, mm1);
2035 punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
2036 /* tmp7 = z11 + z13; /* phase 5 */ */
2037 /* tmp8 = z11 - z13; /* phase 5 */ */
2038 psubw_r2r(mm4, mm1); /* tmp8 */
2040 paddw_r2r(mm4, mm5); /* tmp7 */
2041 /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ */
2042 psllw_i2r(2, mm1);
2044 psllw_i2r(2, mm0);
2046 pmulhw_m2r(fix_141, mm1); /* tmp21 */
2047 /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ */
2048 /* + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ */
2049 psllw_i2r(2, mm3);
2050 movq_r2r(mm0, mm7);
2052 pmulhw_m2r(fix_n184, mm7);
2053 movq_r2r(mm3, mm6);
2055 movq_m2r(*(wsptr), mm2); /* tmp0,final1 */
2057 pmulhw_m2r(fix_108n184, mm6);
2058 /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ */
2059 /* + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ */
2060 movq_r2r(mm2, mm4); /* final1 */
2062 pmulhw_m2r(fix_184n261, mm0);
2063 paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */
2065 pmulhw_m2r(fix_184, mm3);
2066 psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */
2068 /* tmp6 = tmp22 - tmp7; /* phase 2 */ */
2069 psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */
2071 paddw_r2r(mm6, mm7); /* tmp20 */
2072 psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */
2074 paddw_r2r(mm0, mm3); /* tmp22 */
2076 /* tmp5 = tmp21 - tmp6; */
2077 psubw_r2r(mm5, mm3); /* tmp6 */
2079 /* tmp4 = tmp20 + tmp5; */
2080 movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */
2081 psubw_r2r(mm3, mm1); /* tmp5 */
2083 movq_r2r(mm0, mm6); /* final2 */
2084 paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */
2086 /* Final output stage: scale down by a factor of 8 and range-limit */
2088 /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
2089 /* & RANGE_MASK]; */
2090 /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
2091 /* & RANGE_MASK]; final1 */
2094 /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
2095 /* & RANGE_MASK]; */
2096 /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
2097 /* & RANGE_MASK]; final2 */
2098 psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */
2099 psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */
2101 psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */
2103 packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
2105 movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */
2106 packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */
2108 /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
2109 /* & RANGE_MASK]; */
2110 /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
2111 /* & RANGE_MASK]; final3 */
2112 paddw_r2r(mm1, mm7); /* tmp4 */
2113 movq_r2r(mm5, mm3);
2115 paddw_r2r(mm1, mm5); /* tmp2+tmp5 */
2116 psubw_r2r(mm1, mm3); /* tmp2-tmp5 */
2118 psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */
2120 movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */
2121 psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */
2125 /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
2126 /* & RANGE_MASK]; */
2127 /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
2128 /* & RANGE_MASK]; final4 */
2129 movq_r2r(mm4, mm6);
2130 paddw_r2r(mm7, mm4); /* tmp3+tmp4 */
2132 psubw_r2r(mm7, mm6); /* tmp3-tmp4 */
2133 psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */
2135 psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */
2138 movq_r2m(mm4, *dummy);
2139 fprintf(stderr, "3-4 %016llx\n", dummy);
2140 movq_r2m(mm4, *dummy);
2141 fprintf(stderr, "3+4 %016llx\n", dummy);
2145 packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */
2147 packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
2148 movq_r2r(mm2, mm4);
2150 movq_r2r(mm5, mm7);
2151 punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */
2153 punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
2154 movq_r2r(mm2, mm1);
2156 punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */
2158 punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */
2160 punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
2162 movq_r2r(mm7, mm6);
2163 punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */
2165 movq_r2r(mm2, mm0);
2166 punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */
2168 punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */
2170 movq_r2r(mm1, mm3);
2172 punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
2174 dataptr += rskip;
2175 movq_r2m(mm2, *(dataptr));
2177 punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */
2179 dataptr += rskip;
2180 movq_r2m(mm0, *(dataptr));
2182 punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
2184 punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */
2186 dataptr += rskip;
2187 movq_r2m(mm1, *(dataptr));
2189 dataptr += rskip;
2190 movq_r2m(mm3, *(dataptr));
2192 #else
2193 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2194 __s32 tmp10, tmp11, tmp12, tmp13;
2195 __s32 z5, z10, z11, z12, z13;
2196 __s16 *inptr;
2197 __s32 *wsptr;
2198 __u8 *outptr;
2199 int ctr;
2200 __s32 dcval;
2201 __s32 workspace[64];
2203 inptr = data;
2204 wsptr = workspace;
2205 for (ctr = 8; ctr > 0; ctr--) {
2207 if ((inptr[8] | inptr[16] | inptr[24] |
2208 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2209 dcval = inptr[0];
2210 wsptr[0] = dcval;
2211 wsptr[8] = dcval;
2212 wsptr[16] = dcval;
2213 wsptr[24] = dcval;
2214 wsptr[32] = dcval;
2215 wsptr[40] = dcval;
2216 wsptr[48] = dcval;
2217 wsptr[56] = dcval;
2219 inptr++;
2220 wsptr++;
2221 continue;
2224 tmp0 = inptr[0];
2225 tmp1 = inptr[16];
2226 tmp2 = inptr[32];
2227 tmp3 = inptr[48];
2229 tmp10 = tmp0 + tmp2;
2230 tmp11 = tmp0 - tmp2;
2232 tmp13 = tmp1 + tmp3;
2233 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2235 tmp0 = tmp10 + tmp13;
2236 tmp3 = tmp10 - tmp13;
2237 tmp1 = tmp11 + tmp12;
2238 tmp2 = tmp11 - tmp12;
2240 tmp4 = inptr[8];
2241 tmp5 = inptr[24];
2242 tmp6 = inptr[40];
2243 tmp7 = inptr[56];
2245 z13 = tmp6 + tmp5;
2246 z10 = tmp6 - tmp5;
2247 z11 = tmp4 + tmp7;
2248 z12 = tmp4 - tmp7;
2250 tmp7 = z11 + z13;
2251 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2253 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2254 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2255 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2257 tmp6 = tmp12 - tmp7;
2258 tmp5 = tmp11 - tmp6;
2259 tmp4 = tmp10 + tmp5;
2261 wsptr[0] = (__s32) (tmp0 + tmp7);
2262 wsptr[56] = (__s32) (tmp0 - tmp7);
2263 wsptr[8] = (__s32) (tmp1 + tmp6);
2264 wsptr[48] = (__s32) (tmp1 - tmp6);
2265 wsptr[16] = (__s32) (tmp2 + tmp5);
2266 wsptr[40] = (__s32) (tmp2 - tmp5);
2267 wsptr[32] = (__s32) (tmp3 + tmp4);
2268 wsptr[24] = (__s32) (tmp3 - tmp4);
2270 inptr++;
2271 wsptr++;
2274 wsptr = workspace;
2275 for (ctr = 0; ctr < 8; ctr++) {
2276 outptr = &(odata[ctr*rskip]);
2278 tmp10 = wsptr[0] + wsptr[4];
2279 tmp11 = wsptr[0] - wsptr[4];
2281 tmp13 = wsptr[2] + wsptr[6];
2282 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2284 tmp0 = tmp10 + tmp13;
2285 tmp3 = tmp10 - tmp13;
2286 tmp1 = tmp11 + tmp12;
2287 tmp2 = tmp11 - tmp12;
2289 z13 = wsptr[5] + wsptr[3];
2290 z10 = wsptr[5] - wsptr[3];
2291 z11 = wsptr[1] + wsptr[7];
2292 z12 = wsptr[1] - wsptr[7];
2294 tmp7 = z11 + z13;
2295 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2297 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2298 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2299 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2301 tmp6 = tmp12 - tmp7;
2302 tmp5 = tmp11 - tmp6;
2303 tmp4 = tmp10 + tmp5;
2305 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2306 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2307 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2308 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2309 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2310 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2311 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2312 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2314 wsptr += 8;
2316 #endif
/*

Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998

*/
2330 Private function
2332 Initialise all the cache-aliged data blocks
2336 void RTjpeg_init_data(void)
2338 unsigned long dptr;
2340 dptr=(unsigned long)&(RTjpeg_alldata[0]);
2341 dptr+=32;
2342 dptr=dptr>>5;
2343 dptr=dptr<<5; /* cache align data */
2345 RTjpeg_block=(__s16 *)dptr;
2346 dptr+=sizeof(__s16)*64;
2347 RTjpeg_lqt=(__s32 *)dptr;
2348 dptr+=sizeof(__s32)*64;
2349 RTjpeg_cqt=(__s32 *)dptr;
2350 dptr+=sizeof(__s32)*64;
2351 RTjpeg_liqt=(__u32 *)dptr;
2352 dptr+=sizeof(__u32)*64;
2353 RTjpeg_ciqt=(__u32 *)dptr;
2358 External Function
2360 Re-set quality factor
2362 Input: buf -> pointer to 128 ints for quant values store to pass back to
2363 init_decompress.
2364 Q -> quality factor (192=best, 32=worst)
2367 void RTjpeg_init_Q(__u8 Q)
2369 int i;
2370 __u64 qual;
2372 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2374 for(i=0; i<64; i++)
2376 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2377 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
2378 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2379 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
2380 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
2381 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
2382 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
2383 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
2386 RTjpeg_lb8=0;
2387 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2388 RTjpeg_lb8--;
2389 RTjpeg_cb8=0;
2390 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2391 RTjpeg_cb8--;
2393 RTjpeg_dct_init();
2394 RTjpeg_idct_init();
2395 RTjpeg_quant_init();
2400 External Function
2402 Initialise compression.
2404 Input: buf -> pointer to 128 ints for quant values store to pass back to
2405 init_decompress.
2406 width -> width of image
2407 height -> height of image
2408 Q -> quality factor (192=best, 32=worst)
2412 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
2414 int i;
2415 __u64 qual;
2417 RTjpeg_init_data();
2419 RTjpeg_width=width;
2420 RTjpeg_height=height;
2421 RTjpeg_Ywidth = RTjpeg_width>>3;
2422 RTjpeg_Ysize=width * height;
2423 RTjpeg_Cwidth = RTjpeg_width>>4;
2424 RTjpeg_Csize= (width>>1) * height;
2426 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2428 for(i=0; i<64; i++)
2430 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2431 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
2432 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2433 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
2434 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
2435 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
2436 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
2437 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
2440 RTjpeg_lb8=0;
2441 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2442 RTjpeg_lb8--;
2443 RTjpeg_cb8=0;
2444 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2445 RTjpeg_cb8--;
2447 RTjpeg_dct_init();
2448 RTjpeg_quant_init();
2450 for(i=0; i<64; i++)
2451 buf[i]=RTjpeg_liqt[i];
2452 for(i=0; i<64; i++)
2453 buf[64+i]=RTjpeg_ciqt[i];
2456 void RTjpeg_init_decompress(__u32 *buf, int width, int height)
2458 int i;
2460 RTjpeg_init_data();
2462 RTjpeg_width=width;
2463 RTjpeg_height=height;
2464 RTjpeg_Ywidth = RTjpeg_width>>3;
2465 RTjpeg_Ysize=width * height;
2466 RTjpeg_Cwidth = RTjpeg_width>>4;
2467 RTjpeg_Csize= (width>>1) * height;
2469 for(i=0; i<64; i++)
2471 RTjpeg_liqt[i]=buf[i];
2472 RTjpeg_ciqt[i]=buf[i+64];
2475 RTjpeg_lb8=0;
2476 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2477 RTjpeg_lb8--;
2478 RTjpeg_cb8=0;
2479 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2480 RTjpeg_cb8--;
2482 RTjpeg_idct_init();
2484 /* RTjpeg_color_init(); */
2487 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
2489 __s8 * sb;
2490 register __s8 * bp1 = bp + (RTjpeg_width<<3);
2491 register __s8 * bp2 = bp + RTjpeg_Ysize;
2492 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
2493 register int i, j, k;
2495 #ifdef USE_MMX
2496 emms();
2497 #endif
2498 sb=sp;
2499 /* Y */
2500 for(i=RTjpeg_height>>1; i; i-=8)
2502 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2504 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2505 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2506 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2508 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2509 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2510 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2512 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
2513 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2514 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2516 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
2517 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2518 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2520 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2521 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2522 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2524 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2525 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2526 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2529 bp+=RTjpeg_width<<4;
2530 bp1+=RTjpeg_width<<4;
2531 bp2+=RTjpeg_width<<2;
2532 bp3+=RTjpeg_width<<2;
2535 #ifdef USE_MMX
2536 emms();
2537 #endif
2538 return (sp-sb);
2541 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
2543 __s8 * sb;
2544 register __s8 * bp2 = bp + RTjpeg_Ysize;
2545 register __s8 * bp3 = bp2 + RTjpeg_Csize;
2546 register int i, j, k;
2548 #ifdef USE_MMX
2549 emms();
2550 #endif
2551 sb=sp;
2552 /* Y */
2553 for(i=RTjpeg_height; i; i-=8)
2555 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2557 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2558 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2559 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2561 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2562 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2563 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2565 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2566 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2567 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2569 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2570 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2571 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2574 bp+=RTjpeg_width<<3;
2575 bp2+=RTjpeg_width<<2;
2576 bp3+=RTjpeg_width<<2;
2579 #ifdef USE_MMX
2580 emms();
2581 #endif
2582 return (sp-sb);
2585 int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
2587 __s8 * sb;
2588 int i, j;
2590 #ifdef USE_MMX
2591 emms();
2592 #endif
2594 sb=sp;
2595 /* Y */
2596 for(i=0; i<RTjpeg_height; i+=8)
2598 for(j=0; j<RTjpeg_width; j+=8)
2600 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
2601 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2602 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2604 bp+=RTjpeg_width;
2607 #ifdef USE_MMX
2608 emms();
2609 #endif
2610 return (sp-sb);
2613 void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
2615 register __s8 * bp2 = bp + RTjpeg_Ysize;
2616 register __s8 * bp3 = bp2 + (RTjpeg_Csize);
2617 int i, j,k;
2619 #ifdef USE_MMX
2620 emms();
2621 #endif
2623 /* Y */
2624 for(i=RTjpeg_height; i; i-=8)
2626 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
2627 if(*sp==-1)sp++;
2628 else
2630 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2631 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
2633 if(*sp==-1)sp++;
2634 else
2636 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2637 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
2639 if(*sp==-1)sp++;
2640 else
2642 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
2643 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
2645 if(*sp==-1)sp++;
2646 else
2648 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
2649 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
2652 bp+=RTjpeg_width<<3;
2653 bp2+=RTjpeg_width<<2;
2654 bp3+=RTjpeg_width<<2;
2656 #ifdef USE_MMX
2657 emms();
2658 #endif
2661 void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
2663 register __s8 * bp1 = bp + (RTjpeg_width<<3);
2664 register __s8 * bp2 = bp + RTjpeg_Ysize;
2665 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
2666 int i, j,k;
2668 #ifdef USE_MMX
2669 emms();
2670 #endif
2672 /* Y */
2673 for(i=RTjpeg_height>>1; i; i-=8)
2675 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
2676 if(*sp==-1)sp++;
2677 else
2679 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2680 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
2682 if(*sp==-1)sp++;
2683 else
2685 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2686 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
2688 if(*sp==-1)sp++;
2689 else
2691 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2692 RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
2694 if(*sp==-1)sp++;
2695 else
2697 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2698 RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
2700 if(*sp==-1)sp++;
2701 else
2703 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
2704 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
2706 if(*sp==-1)sp++;
2707 else
2709 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
2710 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
2713 bp+=RTjpeg_width<<4;
2714 bp1+=RTjpeg_width<<4;
2715 bp2+=RTjpeg_width<<2;
2716 bp3+=RTjpeg_width<<2;
2718 #ifdef USE_MMX
2719 emms();
2720 #endif
2723 void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
2725 int i, j;
2727 #ifdef USE_MMX
2728 emms();
2729 #endif
2731 /* Y */
2732 for(i=0; i<RTjpeg_height; i+=8)
2734 for(j=0; j<RTjpeg_width; j+=8)
2735 if(*sp==-1)sp++;
2736 else
2738 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
2739 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
2741 bp+=RTjpeg_width<<3;
2746 External Function
2748 Initialise additional data structures for motion compensation
2752 void RTjpeg_init_mcompress(void)
2754 unsigned long tmp;
2756 if(!RTjpeg_old)
2758 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
2759 tmp=(unsigned long)RTjpeg_old;
2760 tmp+=32;
2761 tmp=tmp>>5;
2762 RTjpeg_old=(__s16 *)(tmp<<5);
2764 if (!RTjpeg_old)
2766 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2767 exit(-1);
2769 bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height)));
2772 #ifdef USE_MMX
2774 int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
2776 int i;
2777 mmx_t *mold=(mmx_t *)old;
2778 mmx_t *mblock=(mmx_t *)RTjpeg_block;
2779 mmx_t result;
2780 static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL;
2782 movq_m2r(*mask, mm7);
2783 movq_m2r(neg, mm6);
2784 pxor_r2r(mm5, mm5);
2786 for(i=0; i<8; i++)
2788 movq_m2r(*(mblock++), mm0);
2789 movq_m2r(*(mblock++), mm2);
2790 movq_m2r(*(mold++), mm1);
2791 movq_m2r(*(mold++), mm3);
2792 psubsw_r2r(mm1, mm0);
2793 psubsw_r2r(mm3, mm2);
2794 movq_r2r(mm0, mm1);
2795 movq_r2r(mm2, mm3);
2796 pcmpgtw_r2r(mm7, mm0);
2797 pcmpgtw_r2r(mm7, mm2);
2798 pxor_r2r(mm6, mm1);
2799 pxor_r2r(mm6, mm3);
2800 pcmpgtw_r2r(mm7, mm1);
2801 pcmpgtw_r2r(mm7, mm3);
2802 por_r2r(mm0, mm5);
2803 por_r2r(mm2, mm5);
2804 por_r2r(mm1, mm5);
2805 por_r2r(mm3, mm5);
2807 movq_r2m(mm5, result);
2809 if(result.q)
2811 if(!RTjpeg_mtest)
2812 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
2813 return 0;
2815 /* printf("."); */
2816 return 1;
2819 #else
2820 int RTjpeg_bcomp(__s16 *old, __u16 *mask)
2822 int i;
2824 for(i=0; i<64; i++)
2825 if(abs(old[i]-RTjpeg_block[i])>*mask)
2827 if(!RTjpeg_mtest)
2828 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
2829 return 0;
2831 return 1;
2833 #endif
2835 void RTjpeg_set_test(int i)
2837 RTjpeg_mtest=i;
2840 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
2842 __s8 * sb;
2843 __s16 *block;
2844 register __s8 * bp1 = bp + (RTjpeg_width<<3);
2845 register __s8 * bp2 = bp + RTjpeg_Ysize;
2846 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
2847 register int i, j, k;
2849 #ifdef USE_MMX
2850 emms();
2851 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
2852 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
2853 #else
2854 RTjpeg_lmask=lmask;
2855 RTjpeg_cmask=cmask;
2856 #endif
2858 sb=sp;
2859 block=RTjpeg_old;
2860 /* Y */
2861 for(i=RTjpeg_height>>1; i; i-=8)
2863 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2865 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2866 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2867 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2869 *((__u8 *)sp++)=255;
2871 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2872 block+=64;
2874 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2875 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2876 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2878 *((__u8 *)sp++)=255;
2880 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2881 block+=64;
2883 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
2884 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2885 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2887 *((__u8 *)sp++)=255;
2889 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2890 block+=64;
2892 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
2893 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2894 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2896 *((__u8 *)sp++)=255;
2898 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2899 block+=64;
2901 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2902 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2903 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
2905 *((__u8 *)sp++)=255;
2907 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2908 block+=64;
2910 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2911 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2912 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
2914 *((__u8 *)sp++)=255;
2916 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2917 block+=64;
2919 bp+=RTjpeg_width<<4;
2920 bp1+=RTjpeg_width<<4;
2921 bp2+=RTjpeg_width<<2;
2922 bp3+=RTjpeg_width<<2;
2925 #ifdef USE_MMX
2926 emms();
2927 #endif
2928 return (sp-sb);
2932 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
2934 __s8 * sb;
2935 __s16 *block;
2936 register __s8 * bp2;
2937 register __s8 * bp3;
2938 register int i, j, k;
2940 #ifdef USE_MMX
2941 emms();
2942 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
2943 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
2944 #else
2945 RTjpeg_lmask=lmask;
2946 RTjpeg_cmask=cmask;
2947 #endif
2949 bp = bp - RTjpeg_width*0;
2950 bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0;
2951 bp3 = bp2 + RTjpeg_Csize;
2953 sb=sp;
2954 block=RTjpeg_old;
2955 /* Y */
2956 for(i=RTjpeg_height; i; i-=8)
2958 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2960 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2961 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2962 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2964 *((__u8 *)sp++)=255;
2966 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2967 block+=64;
2969 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2970 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2971 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
2973 *((__u8 *)sp++)=255;
2975 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2976 block+=64;
2978 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2979 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2980 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
2982 *((__u8 *)sp++)=255;
2984 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2985 block+=64;
2987 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2988 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2989 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
2991 *((__u8 *)sp++)=255;
2993 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2994 block+=64;
2997 bp+=RTjpeg_width<<3;
2998 bp2+=RTjpeg_width<<2;
2999 bp3+=RTjpeg_width<<2;
3001 printf ("%d\n", block - RTjpeg_old);
3002 #ifdef USE_MMX
3003 emms();
3004 #endif
3005 return (sp-sb);
3008 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
3010 __s8 * sb;
3011 __s16 *block;
3012 int i, j;
3014 #ifdef USE_MMX
3015 emms();
3016 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
3017 #else
3018 RTjpeg_lmask=lmask;
3019 #endif
3022 sb=sp;
3023 block=RTjpeg_old;
3024 /* Y */
3025 for(i=0; i<RTjpeg_height; i+=8)
3027 for(j=0; j<RTjpeg_width; j+=8)
3029 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
3030 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3031 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3033 *((__u8 *)sp++)=255;
3034 /* printf("* %d ", sp[-1]); */
3035 } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3036 block+=64;
3038 bp+=RTjpeg_width<<3;
3040 #ifdef USE_MMX
3041 emms();
3042 #endif
3043 return (sp-sb);
/* Colour-conversion initialisation hook; there is nothing to set up. */
void RTjpeg_color_init(void)
{
}
/*
 * Fixed-point YCbCr -> RGB coefficients, scaled by 2^16 (the converters
 * below shift every sum right by 16 before clamping to 0..255).
 *
 * NOTE(review): KcrR has the same value as Ky (about 1.164); the usual
 * BT.601 Cr-to-R factor of about 1.596 would be 104595 at this scale -
 * confirm whether 76284 is intended.
 */
#define KcrR 76284	/* Cr contribution to R */
#define KcrG 53281	/* Cr contribution to G (subtracted) */
#define KcbG 25625	/* Cb contribution to G (subtracted) */
#define KcbB 132252	/* Cb contribution to B */
#define Ky 76284	/* gain applied to (Y - 16) */
3056 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride)
3058 int tmp;
3059 int i, j;
3060 __s32 y, crR, crG, cbG, cbB;
3061 __u8 *bufcr, *bufcb, *bufy, *bufoute;
3062 int yskip;
3064 yskip=RTjpeg_width;
3066 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3067 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3068 bufy=&buf[0];
3069 bufoute=rgb;
3071 for(i=0; i<(RTjpeg_height); i++)
3073 for(j=0; j<RTjpeg_width; j+=2)
3075 crR=(*bufcr-128)*KcrR;
3076 crG=(*(bufcr++)-128)*KcrG;
3077 cbG=(*bufcb-128)*KcbG;
3078 cbB=(*(bufcb++)-128)*KcbB;
3080 y=(bufy[j]-16)*Ky;
3082 tmp=(y+crR)>>16;
3083 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3084 tmp=(y-crG-cbG)>>16;
3085 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3086 tmp=(y+cbB)>>16;
3087 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3089 y=(bufy[j+1]-16)*Ky;
3091 tmp=(y+crR)>>16;
3092 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3093 tmp=(y-crG-cbG)>>16;
3094 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3095 tmp=(y+cbB)>>16;
3096 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3099 bufy+=yskip;
3104 void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride)
3106 int tmp;
3107 int i, j;
3108 __s32 y, crR, crG, cbG, cbB;
3109 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3110 int oskip, yskip;
3112 if(stride==0)
3113 oskip=RTjpeg_width*3;
3114 else
3115 oskip=2*stride-RTjpeg_width*3;
3117 yskip=RTjpeg_width;
3119 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3120 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3121 bufy=&buf[0];
3122 bufoute=rgb;
3123 bufouto=rgb+RTjpeg_width*3;
3125 for(i=0; i<(RTjpeg_height>>1); i++)
3127 for(j=0; j<RTjpeg_width; j+=2)
3129 crR=(*bufcr-128)*KcrR;
3130 crG=(*(bufcr++)-128)*KcrG;
3131 cbG=(*bufcb-128)*KcbG;
3132 cbB=(*(bufcb++)-128)*KcbB;
3134 y=(bufy[j]-16)*Ky;
3136 tmp=(y+crR)>>16;
3137 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3138 tmp=(y-crG-cbG)>>16;
3139 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3140 tmp=(y+cbB)>>16;
3141 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3143 y=(bufy[j+1]-16)*Ky;
3145 tmp=(y+crR)>>16;
3146 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3147 tmp=(y-crG-cbG)>>16;
3148 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3149 tmp=(y+cbB)>>16;
3150 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3152 y=(bufy[j+yskip]-16)*Ky;
3154 tmp=(y+crR)>>16;
3155 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3156 tmp=(y-crG-cbG)>>16;
3157 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3158 tmp=(y+cbB)>>16;
3159 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3161 y=(bufy[j+1+yskip]-16)*Ky;
3163 tmp=(y+crR)>>16;
3164 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3165 tmp=(y-crG-cbG)>>16;
3166 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3167 tmp=(y+cbB)>>16;
3168 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3171 bufoute+=oskip;
3172 bufouto+=oskip;
3173 bufy+=yskip<<1;
3178 void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride)
3180 int tmp;
3181 int i, j;
3182 __s32 y, crR, crG, cbG, cbB;
3183 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3184 int oskip, yskip;
3186 if(stride==0)
3187 oskip=RTjpeg_width*4;
3188 else
3189 oskip = 2*stride-RTjpeg_width*4;
3190 yskip=RTjpeg_width;
3192 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3193 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3194 bufy=&buf[0];
3195 bufoute=rgb;
3196 bufouto=rgb+RTjpeg_width*4;
3198 for(i=0; i<(RTjpeg_height>>1); i++)
3200 for(j=0; j<RTjpeg_width; j+=2)
3202 crR=(*bufcr-128)*KcrR;
3203 crG=(*(bufcr++)-128)*KcrG;
3204 cbG=(*bufcb-128)*KcbG;
3205 cbB=(*(bufcb++)-128)*KcbB;
3207 y=(bufy[j]-16)*Ky;
3209 tmp=(y+cbB)>>16;
3210 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3211 tmp=(y-crG-cbG)>>16;
3212 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3213 tmp=(y+crR)>>16;
3214 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3215 bufoute++;
3217 y=(bufy[j+1]-16)*Ky;
3219 tmp=(y+cbB)>>16;
3220 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3221 tmp=(y-crG-cbG)>>16;
3222 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3223 tmp=(y+crR)>>16;
3224 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3225 bufoute++;
3227 y=(bufy[j+yskip]-16)*Ky;
3229 tmp=(y+cbB)>>16;
3230 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3231 tmp=(y-crG-cbG)>>16;
3232 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3233 tmp=(y+crR)>>16;
3234 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3235 bufouto++;
3237 y=(bufy[j+1+yskip]-16)*Ky;
3239 tmp=(y+cbB)>>16;
3240 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3241 tmp=(y-crG-cbG)>>16;
3242 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3243 tmp=(y+crR)>>16;
3244 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3245 bufouto++;
3248 bufoute+=oskip;
3249 bufouto+=oskip;
3250 bufy+=yskip<<1;
3254 void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride)
3256 int tmp;
3257 int i, j;
3258 __s32 y, crR, crG, cbG, cbB;
3259 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3260 int oskip, yskip;
3262 if(stride==0)
3263 oskip=RTjpeg_width*3;
3264 else
3265 oskip=2*stride - RTjpeg_width*3;
3267 yskip=RTjpeg_width;
3269 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3270 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3271 bufy=&buf[0];
3272 bufoute=rgb;
3273 bufouto=rgb+RTjpeg_width*3;
3275 for(i=0; i<(RTjpeg_height>>1); i++)
3277 for(j=0; j<RTjpeg_width; j+=2)
3279 crR=(*bufcr-128)*KcrR;
3280 crG=(*(bufcr++)-128)*KcrG;
3281 cbG=(*bufcb-128)*KcbG;
3282 cbB=(*(bufcb++)-128)*KcbB;
3284 y=(bufy[j]-16)*Ky;
3286 tmp=(y+cbB)>>16;
3287 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3288 tmp=(y-crG-cbG)>>16;
3289 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3290 tmp=(y+crR)>>16;
3291 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3293 y=(bufy[j+1]-16)*Ky;
3295 tmp=(y+cbB)>>16;
3296 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3297 tmp=(y-crG-cbG)>>16;
3298 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3299 tmp=(y+crR)>>16;
3300 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3302 y=(bufy[j+yskip]-16)*Ky;
3304 tmp=(y+cbB)>>16;
3305 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3306 tmp=(y-crG-cbG)>>16;
3307 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3308 tmp=(y+crR)>>16;
3309 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3311 y=(bufy[j+1+yskip]-16)*Ky;
3313 tmp=(y+cbB)>>16;
3314 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3315 tmp=(y-crG-cbG)>>16;
3316 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3317 tmp=(y+crR)>>16;
3318 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3321 bufoute+=oskip;
3322 bufouto+=oskip;
3323 bufy+=yskip<<1;
3327 void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride)
3329 int tmp;
3330 int i, j;
3331 __s32 y, crR, crG, cbG, cbB;
3332 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3333 int oskip, yskip;
3334 unsigned char r, g, b;
3336 if(stride==0)
3337 oskip=RTjpeg_width*2;
3338 else
3339 oskip=2*stride-RTjpeg_width*2;
3341 yskip=RTjpeg_width;
3343 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3344 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3345 bufy=&buf[0];
3346 bufoute=rgb;
3347 bufouto=rgb+RTjpeg_width*2;
3349 for(i=0; i<(RTjpeg_height>>1); i++)
3351 for(j=0; j<RTjpeg_width; j+=2)
3353 crR=(*bufcr-128)*KcrR;
3354 crG=(*(bufcr++)-128)*KcrG;
3355 cbG=(*bufcb-128)*KcbG;
3356 cbB=(*(bufcb++)-128)*KcbB;
3358 y=(bufy[j]-16)*Ky;
3360 tmp=(y+cbB)>>16;
3361 b=(tmp>255)?255:((tmp<0)?0:tmp);
3362 tmp=(y-crG-cbG)>>16;
3363 g=(tmp>255)?255:((tmp<0)?0:tmp);
3364 tmp=(y+crR)>>16;
3365 r=(tmp>255)?255:((tmp<0)?0:tmp);
3366 tmp=(int)((int)b >> 3);
3367 tmp|=(int)(((int)g >> 2) << 5);
3368 tmp|=(int)(((int)r >> 3) << 11);
3369 *(bufoute++)=tmp&0xff;
3370 *(bufoute++)=tmp>>8;
3373 y=(bufy[j+1]-16)*Ky;
3375 tmp=(y+cbB)>>16;
3376 b=(tmp>255)?255:((tmp<0)?0:tmp);
3377 tmp=(y-crG-cbG)>>16;
3378 g=(tmp>255)?255:((tmp<0)?0:tmp);
3379 tmp=(y+crR)>>16;
3380 r=(tmp>255)?255:((tmp<0)?0:tmp);
3381 tmp=(int)((int)b >> 3);
3382 tmp|=(int)(((int)g >> 2) << 5);
3383 tmp|=(int)(((int)r >> 3) << 11);
3384 *(bufoute++)=tmp&0xff;
3385 *(bufoute++)=tmp>>8;
3387 y=(bufy[j+yskip]-16)*Ky;
3389 tmp=(y+cbB)>>16;
3390 b=(tmp>255)?255:((tmp<0)?0:tmp);
3391 tmp=(y-crG-cbG)>>16;
3392 g=(tmp>255)?255:((tmp<0)?0:tmp);
3393 tmp=(y+crR)>>16;
3394 r=(tmp>255)?255:((tmp<0)?0:tmp);
3395 tmp=(int)((int)b >> 3);
3396 tmp|=(int)(((int)g >> 2) << 5);
3397 tmp|=(int)(((int)r >> 3) << 11);
3398 *(bufouto++)=tmp&0xff;
3399 *(bufouto++)=tmp>>8;
3401 y=(bufy[j+1+yskip]-16)*Ky;
3403 tmp=(y+cbB)>>16;
3404 b=(tmp>255)?255:((tmp<0)?0:tmp);
3405 tmp=(y-crG-cbG)>>16;
3406 g=(tmp>255)?255:((tmp<0)?0:tmp);
3407 tmp=(y+crR)>>16;
3408 r=(tmp>255)?255:((tmp<0)?0:tmp);
3409 tmp=(int)((int)b >> 3);
3410 tmp|=(int)(((int)g >> 2) << 5);
3411 tmp|=(int)(((int)r >> 3) << 11);
3412 *(bufouto++)=tmp&0xff;
3413 *(bufouto++)=tmp>>8;
3416 bufoute+=oskip;
3417 bufouto+=oskip;
3418 bufy+=yskip<<1;
/* TODO: fix stride - the copy below does not take the stride argument into account */
3424 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride)
3426 bcopy(buf, rgb, RTjpeg_width*RTjpeg_height);