Add support for VDPAU video out, including hardware decoding.
[mplayer/glamo.git] / libmpcodecs / native / rtjpegn.c
blob538bff609ad4a1323f986ad7d13b0179c6ab97e5
1 /*
2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
4 With modifications by:
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6 and
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #include "config.h"
30 #include "mpbswap.h"
31 #include "rtjpegn.h"
33 #if HAVE_MMX
34 #include "mmx.h"
35 #endif
37 //#define SHOWBLOCK 1
38 #define BETTERCOMPRESSION 1
/*
 * Scan-order table: RTjpeg_ZZ[k] is the index inside the 8x8 coefficient
 * block of the k-th coefficient visited by the stream coder.
 *
 * The table must be a permutation of 0..63.  The leading 0 entry (the
 * coefficient that is always written as the first stream byte) has to be
 * present; without it every scan position is shifted by one and the last
 * slot silently becomes 0.
 */
static const unsigned char RTjpeg_ZZ[64]={
0,
8, 1,
2, 9, 16,
24, 17, 10, 3,
4, 11, 18, 25, 32,
40, 33, 26, 19, 12, 5,
6, 13, 20, 27, 34, 41, 48,
56, 49, 42, 35, 28, 21, 14, 7,
15, 22, 29, 36, 43, 50, 57,
58, 51, 44, 37, 30, 23,
31, 38, 45, 52, 59,
60, 53, 46, 39,
47, 54, 61,
62, 55,
63 };
57 static const __u64 RTjpeg_aan_tab[64]={
58 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
59 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
60 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
61 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
62 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
63 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
64 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
65 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
68 #if !HAVE_MMX
69 static __s32 RTjpeg_ws[64+31];
70 #endif
71 __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];
73 static __s16 *block; // rh
74 static __s16 *RTjpeg_block;
75 static __s32 *RTjpeg_lqt;
76 static __s32 *RTjpeg_cqt;
77 static __u32 *RTjpeg_liqt;
78 static __u32 *RTjpeg_ciqt;
80 static unsigned char RTjpeg_lb8;
81 static unsigned char RTjpeg_cb8;
82 static int RTjpeg_width, RTjpeg_height;
83 static int RTjpeg_Ywidth, RTjpeg_Cwidth;
84 static int RTjpeg_Ysize, RTjpeg_Csize;
86 static __s16 *RTjpeg_old=NULL;
88 #if HAVE_MMX
89 mmx_t RTjpeg_lmask;
90 mmx_t RTjpeg_cmask;
91 #else
92 __u16 RTjpeg_lmask;
93 __u16 RTjpeg_cmask;
94 #endif
95 int RTjpeg_mtest=0;
/*
 * Base luminance quantisation table, 8 rows of 8 entries.
 * The values coincide with the example luminance table of the JPEG
 * standard (ITU-T T.81, Annex K).
 */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
    16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99
};
/*
 * Base chrominance quantisation table, 8 rows of 8 entries.
 * The values coincide with the example chrominance table of the JPEG
 * standard (ITU-T T.81, Annex K).
 */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};
119 #ifdef BETTERCOMPRESSION
121 /*--------------------------------------------------*/
122 /* better encoding, but needs a lot more cpu time */
123 /* seems to be more effective than old method +lzo */
124 /* with this encoding lzo isn't efficient anymore */
125 /* there is still more potential for better */
126 /* encoding but that would need even more cputime */
127 /* anyway your mileage may vary */
128 /* */
129 /* written by Martin BIELY and Roman HOCHLEITNER */
130 /*--------------------------------------------------*/
132 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
133 /* Block to Stream (encoding) */
134 /* */
136 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
138 register int ci, co=1;
139 register __s16 ZZvalue;
140 register unsigned char bitten;
141 register unsigned char bitoff;
143 #ifdef SHOWBLOCK
145 int ii;
146 for (ii=0; ii < 64; ii++) {
147 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
149 fprintf(stdout, "\n\n");
151 #endif
153 // *strm++ = 0x10;
154 // *strm = 0x00;
156 // return 2;
158 // first byte allways written
159 ((__u8*)strm)[0]=
160 (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
163 ci=63;
164 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
166 bitten = ((unsigned char)ci) << 2;
168 if (ci==0) {
169 ((__u8*)strm)[1]= bitten;
170 co = 2;
171 return (int)co;
174 /* bitoff=0 because the high 6bit contain first non zero position */
175 bitoff = 0;
176 co = 1;
178 for(; ci>0; ci--) {
180 ZZvalue = data[RTjpeg_ZZ[ci]];
182 switch(ZZvalue) {
183 case 0:
184 break;
185 case 1:
186 bitten |= (0x01<<bitoff);
187 break;
188 case -1:
189 bitten |= (0x03<<bitoff);
190 break;
191 default:
192 bitten |= (0x02<<bitoff);
193 goto HERZWEH;
194 break;
197 if( bitoff == 0 ) {
198 ((__u8*)strm)[co]= bitten;
199 bitten = 0;
200 bitoff = 8;
201 co++;
202 } /* "fall through" */
203 bitoff-=2;
207 /* ci must be 0 */
208 if(bitoff != 6) {
210 ((__u8*)strm)[co]= bitten;
211 co++;
214 goto BAUCHWEH;
216 HERZWEH:
217 /* ci cannot be 0 */
218 /* correct bitoff to nibble boundaries */
220 switch(bitoff){
221 case 4:
222 case 6:
223 bitoff = 0;
224 break;
225 case 2:
226 case 0:
227 ((__u8*)strm)[co]= bitten;
228 bitoff = 4;
229 co++;
230 bitten = 0; // clear half nibble values in bitten
231 break;
232 default:
233 break;
236 for(; ci>0; ci--) {
238 ZZvalue = data[RTjpeg_ZZ[ci]];
240 if( (ZZvalue > 7) || (ZZvalue < -7) ) {
241 bitten |= (0x08<<bitoff);
242 goto HIRNWEH;
245 bitten |= (ZZvalue&0xf)<<bitoff;
247 if( bitoff == 0 ) {
248 ((__u8*)strm)[co]= bitten;
249 bitten = 0;
250 bitoff = 8;
251 co++;
252 } /* "fall thru" */
253 bitoff-=4;
256 /* ci must be 0 */
257 if( bitoff == 0 ) {
258 ((__u8*)strm)[co]= bitten;
259 co++;
261 goto BAUCHWEH;
263 HIRNWEH:
265 ((__u8*)strm)[co]= bitten;
266 co++;
269 /* bitting is over now we bite */
270 for(; ci>0; ci--) {
272 ZZvalue = data[RTjpeg_ZZ[ci]];
274 if(ZZvalue>0)
276 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
278 else
280 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
286 BAUCHWEH:
287 /* we gotoo much now we are ill */
288 #ifdef SHOWBLOCK
290 int i;
291 fprintf(stdout, "\nco = '%d'\n", co);
292 for (i=0; i < co+2; i++) {
293 fprintf(stdout, "%d ", strm[i]);
295 fprintf(stdout, "\n\n");
297 #endif
299 return (int)co;
302 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
303 /* Stream to Block (decoding) */
304 /* */
306 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
308 int ci;
309 register int co;
310 register int i;
311 register unsigned char bitten;
312 register unsigned char bitoff;
314 /* first byte always read */
315 i=RTjpeg_ZZ[0];
316 data[i]=((__u8)strm[0])*qtbl[i];
318 /* we start at the behind */
320 bitten = ((unsigned char)strm[1]) >> 2;
321 co = 63;
322 for(; co > bitten; co--) {
324 data[RTjpeg_ZZ[co]] = 0;
328 if (co==0) {
329 ci = 2;
330 goto AUTOBAHN;
333 /* we have to read the last 2 bits of the second byte */
334 ci=1;
335 bitoff = 0;
337 for(; co>0; co--) {
339 bitten = ((unsigned char)strm[ci]) >> bitoff;
340 bitten &= 0x03;
342 i=RTjpeg_ZZ[co];
344 switch( bitten ) {
345 case 0x03:
346 data[i]= -qtbl[i];
347 break;
348 case 0x02:
349 goto FUSSWEG;
350 break;
351 case 0x01:
352 data[i]= qtbl[i];
353 break;
354 case 0x00:
355 data[i]= 0;
356 break;
357 default:
358 break;
361 if( bitoff == 0 ) {
362 bitoff = 8;
363 ci++;
365 bitoff -= 2;
367 /* co is 0 now */
368 /* data is written properly */
370 /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
371 if (bitoff!=6) ci++;
373 goto AUTOBAHN;
376 FUSSWEG:
377 /* correct bitoff to nibble */
378 switch(bitoff){
379 case 4:
380 case 6:
381 bitoff = 0;
382 break;
383 case 2:
384 case 0:
385 /* we have to read from the next byte */
386 ci++;
387 bitoff = 4;
388 break;
389 default:
390 break;
393 for(; co>0; co--) {
395 bitten = ((unsigned char)strm[ci]) >> bitoff;
396 bitten &= 0x0f;
398 i=RTjpeg_ZZ[co];
400 if( bitten == 0x08 ) {
401 goto STRASSE;
404 /* the compiler cannot do sign extension for signed nibbles */
405 if( bitten & 0x08 ) {
406 bitten |= 0xf0;
408 /* the unsigned char bitten now is a valid signed char */
410 data[i]=((signed char)bitten)*qtbl[i];
412 if( bitoff == 0 ) {
413 bitoff = 8;
414 ci++;
416 bitoff -= 4;
418 /* co is 0 */
420 /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
421 if (bitoff!=4) ci++;
423 goto AUTOBAHN;
425 STRASSE:
426 ci++;
428 for(; co>0; co--) {
429 i=RTjpeg_ZZ[co];
430 data[i]=strm[ci++]*qtbl[i];
433 /* ci now is the count, because it points to next element => no incrementing */
435 AUTOBAHN:
437 #ifdef SHOWBLOCK
438 fprintf(stdout, "\nci = '%d'\n", ci);
439 for (i=0; i < 64; i++) {
440 fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
442 fprintf(stdout, "\n\n");
443 #endif
445 return ci;
448 #else
450 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
452 register int ci, co=1, tmp;
453 register __s16 ZZvalue;
455 #ifdef SHOWBLOCK
457 int ii;
458 for (ii=0; ii < 64; ii++) {
459 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
461 fprintf(stdout, "\n\n");
463 #endif
465 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
467 for(ci=1; ci<=bt8; ci++)
469 ZZvalue = data[RTjpeg_ZZ[ci]];
471 if(ZZvalue>0)
473 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
475 else
477 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
481 for(; ci<64; ci++)
483 ZZvalue = data[RTjpeg_ZZ[ci]];
485 if(ZZvalue>0)
487 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue;
489 else if(ZZvalue<0)
491 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue;
493 else /* compress zeros */
495 tmp=ci;
498 ci++;
500 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
502 strm[co++]=(__s8)(63+(ci-tmp));
503 ci--;
506 return (int)co;
509 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
511 int ci=1, co=1, tmp;
512 register int i;
514 i=RTjpeg_ZZ[0];
515 data[i]=((__u8)strm[0])*qtbl[i];
517 for(co=1; co<=bt8; co++)
519 i=RTjpeg_ZZ[co];
520 data[i]=strm[ci++]*qtbl[i];
523 for(; co<64; co++)
525 if(strm[ci]>63)
527 tmp=co+strm[ci]-63;
528 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
529 co--;
530 } else
532 i=RTjpeg_ZZ[co];
533 data[i]=strm[ci]*qtbl[i];
535 ci++;
537 return (int)ci;
539 #endif
541 #if HAVE_MMX
542 void RTjpeg_quant_init(void)
544 int i;
545 __s16 *qtbl;
547 qtbl=(__s16 *)RTjpeg_lqt;
548 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];
550 qtbl=(__s16 *)RTjpeg_cqt;
551 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
554 static mmx_t RTjpeg_ones={0x0001000100010001LL};
555 static mmx_t RTjpeg_half={0x7fff7fff7fff7fffLL};
557 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
559 int i;
560 mmx_t *bl, *ql;
562 ql=(mmx_t *)qtbl;
563 bl=(mmx_t *)block;
565 movq_m2r(RTjpeg_ones, mm6);
566 movq_m2r(RTjpeg_half, mm7);
568 for(i=16; i; i--)
570 movq_m2r(*(ql++), mm0); /* quant vals (4) */
571 movq_m2r(*bl, mm2); /* block vals (4) */
572 movq_r2r(mm0, mm1);
573 movq_r2r(mm2, mm3);
575 punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
576 punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
578 punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
579 punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
581 pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
582 pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
584 psrad_i2r(16, mm0);
585 psrad_i2r(16, mm1);
587 packssdw_r2r(mm1, mm0);
589 movq_r2m(mm0, *(bl++));
593 #else
/*
 * RTjpeg_quant_init (non-MMX build) - intentionally empty.
 * The C RTjpeg_quant() multiplies by the 32-bit quantiser entries
 * directly, so no table conversion is performed here.
 */
void RTjpeg_quant_init(void)
{
}
598 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
600 int i;
602 for(i=0; i<64; i++)
603 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
605 #endif
/*
 * Perform the forward DCT on one block of samples.
 */
#if HAVE_MMX
/*
 * DCT multipliers replicated into four 16-bit lanes.  Each lane is the
 * real constant scaled by 2^14 (e.g. 0x2D41 = 11585 ~ 0.7071 * 16384).
 */
static mmx_t RTjpeg_C4    = {0x2D412D412D412D41LL};  /* 0.707106781 */
static mmx_t RTjpeg_C6    = {0x187E187E187E187ELL};  /* 0.382683433 */
static mmx_t RTjpeg_C2mC6 = {0x22A322A322A322A3LL};  /* 0.541196100 */
static mmx_t RTjpeg_C2pC6 = {0x539F539F539F539FLL};  /* 1.306562965 */
static mmx_t RTjpeg_zero  = {0x0000000000000000LL};

#else

/* DCT multipliers as integers with 8 fractional bits: round(x * 256). */
#define FIX_0_382683433 ((__s32) 98)    /* FIX(0.382683433) */
#define FIX_0_541196100 ((__s32) 139)   /* FIX(0.541196100) */
#define FIX_0_707106781 ((__s32) 181)   /* FIX(0.707106781) */
#define FIX_1_306562965 ((__s32) 334)   /* FIX(1.306562965) */

/* Drop 8 (resp. 16) fractional bits with rounding, narrow to 16 bits. */
#define DESCALE10(v) (__s16)( ((v)+128) >> 8)
#define DESCALE20(v) (__s16)(((v)+32768) >> 16)
/* Plain product of a value and one of the FIX_* constants, as __s32. */
#define D_MULTIPLY(v,k) ((__s32) ((v) * (k)))
#endif
629 void RTjpeg_dct_init(void)
631 int i;
633 for(i=0; i<64; i++)
635 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
636 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
640 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
642 #if !HAVE_MMX
643 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
644 __s32 tmp10, tmp11, tmp12, tmp13;
645 __s32 z1, z2, z3, z4, z5, z11, z13;
646 __u8 *idataptr;
647 __s16 *odataptr;
648 __s32 *wsptr;
649 int ctr;
651 idataptr = idata;
652 wsptr = RTjpeg_ws;
653 for (ctr = 7; ctr >= 0; ctr--) {
654 tmp0 = idataptr[0] + idataptr[7];
655 tmp7 = idataptr[0] - idataptr[7];
656 tmp1 = idataptr[1] + idataptr[6];
657 tmp6 = idataptr[1] - idataptr[6];
658 tmp2 = idataptr[2] + idataptr[5];
659 tmp5 = idataptr[2] - idataptr[5];
660 tmp3 = idataptr[3] + idataptr[4];
661 tmp4 = idataptr[3] - idataptr[4];
663 tmp10 = (tmp0 + tmp3); /* phase 2 */
664 tmp13 = tmp0 - tmp3;
665 tmp11 = (tmp1 + tmp2);
666 tmp12 = tmp1 - tmp2;
668 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
669 wsptr[4] = (tmp10 - tmp11)<<8;
671 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
672 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
673 wsptr[6] = (tmp13<<8) - z1;
675 tmp10 = tmp4 + tmp5; /* phase 2 */
676 tmp11 = tmp5 + tmp6;
677 tmp12 = tmp6 + tmp7;
679 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
680 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
681 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
682 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
684 z11 = (tmp7<<8) + z3; /* phase 5 */
685 z13 = (tmp7<<8) - z3;
687 wsptr[5] = z13 + z2; /* phase 6 */
688 wsptr[3] = z13 - z2;
689 wsptr[1] = z11 + z4;
690 wsptr[7] = z11 - z4;
692 idataptr += rskip<<3; /* advance pointer to next row */
693 wsptr += 8;
696 wsptr = RTjpeg_ws;
697 odataptr=odata;
698 for (ctr = 7; ctr >= 0; ctr--) {
699 tmp0 = wsptr[0] + wsptr[56];
700 tmp7 = wsptr[0] - wsptr[56];
701 tmp1 = wsptr[8] + wsptr[48];
702 tmp6 = wsptr[8] - wsptr[48];
703 tmp2 = wsptr[16] + wsptr[40];
704 tmp5 = wsptr[16] - wsptr[40];
705 tmp3 = wsptr[24] + wsptr[32];
706 tmp4 = wsptr[24] - wsptr[32];
708 tmp10 = tmp0 + tmp3; /* phase 2 */
709 tmp13 = tmp0 - tmp3;
710 tmp11 = tmp1 + tmp2;
711 tmp12 = tmp1 - tmp2;
713 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
714 odataptr[32] = DESCALE10(tmp10 - tmp11);
716 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
717 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
718 odataptr[48] = DESCALE20((tmp13<<8) - z1);
720 tmp10 = tmp4 + tmp5; /* phase 2 */
721 tmp11 = tmp5 + tmp6;
722 tmp12 = tmp6 + tmp7;
724 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
725 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
726 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
727 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
729 z11 = (tmp7<<8) + z3; /* phase 5 */
730 z13 = (tmp7<<8) - z3;
732 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
733 odataptr[24] = DESCALE20(z13 - z2);
734 odataptr[8] = DESCALE20(z11 + z4);
735 odataptr[56] = DESCALE20(z11 - z4);
737 odataptr++; /* advance pointer to next column */
738 wsptr++;
740 #else
741 volatile mmx_t tmp6, tmp7;
742 register mmx_t *dataptr = (mmx_t *)odata;
743 mmx_t *idata2 = (mmx_t *)idata;
745 // first copy the input 8 bit to the destination 16 bits
747 movq_m2r(RTjpeg_zero, mm2);
750 movq_m2r(*idata2, mm0);
751 movq_r2r(mm0, mm1);
753 punpcklbw_r2r(mm2, mm0);
754 movq_r2m(mm0, *(dataptr));
756 punpckhbw_r2r(mm2, mm1);
757 movq_r2m(mm1, *(dataptr+1));
759 idata2 += rskip;
761 movq_m2r(*idata2, mm0);
762 movq_r2r(mm0, mm1);
764 punpcklbw_r2r(mm2, mm0);
765 movq_r2m(mm0, *(dataptr+2));
767 punpckhbw_r2r(mm2, mm1);
768 movq_r2m(mm1, *(dataptr+3));
770 idata2 += rskip;
772 movq_m2r(*idata2, mm0);
773 movq_r2r(mm0, mm1);
775 punpcklbw_r2r(mm2, mm0);
776 movq_r2m(mm0, *(dataptr+4));
778 punpckhbw_r2r(mm2, mm1);
779 movq_r2m(mm1, *(dataptr+5));
781 idata2 += rskip;
783 movq_m2r(*idata2, mm0);
784 movq_r2r(mm0, mm1);
786 punpcklbw_r2r(mm2, mm0);
787 movq_r2m(mm0, *(dataptr+6));
789 punpckhbw_r2r(mm2, mm1);
790 movq_r2m(mm1, *(dataptr+7));
792 idata2 += rskip;
794 movq_m2r(*idata2, mm0);
795 movq_r2r(mm0, mm1);
797 punpcklbw_r2r(mm2, mm0);
798 movq_r2m(mm0, *(dataptr+8));
800 punpckhbw_r2r(mm2, mm1);
801 movq_r2m(mm1, *(dataptr+9));
803 idata2 += rskip;
805 movq_m2r(*idata2, mm0);
806 movq_r2r(mm0, mm1);
808 punpcklbw_r2r(mm2, mm0);
809 movq_r2m(mm0, *(dataptr+10));
811 punpckhbw_r2r(mm2, mm1);
812 movq_r2m(mm1, *(dataptr+11));
814 idata2 += rskip;
816 movq_m2r(*idata2, mm0);
817 movq_r2r(mm0, mm1);
819 punpcklbw_r2r(mm2, mm0);
820 movq_r2m(mm0, *(dataptr+12));
822 punpckhbw_r2r(mm2, mm1);
823 movq_r2m(mm1, *(dataptr+13));
825 idata2 += rskip;
827 movq_m2r(*idata2, mm0);
828 movq_r2r(mm0, mm1);
830 punpcklbw_r2r(mm2, mm0);
831 movq_r2m(mm0, *(dataptr+14));
833 punpckhbw_r2r(mm2, mm1);
834 movq_r2m(mm1, *(dataptr+15));
836 /* Start Transpose to do calculations on rows */
838 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
840 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
841 movq_r2r(mm7, mm5);
843 punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
844 movq_r2r(mm6, mm2);
846 punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
847 movq_r2r(mm7, mm1);
849 movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
850 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
852 movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
853 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
855 movq_r2m(mm7,*(dataptr+9)); // write result 1
856 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
858 movq_r2m(mm1,*(dataptr+11)); // write result 2
859 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
861 movq_r2r(mm5, mm1);
862 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
864 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
865 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
867 movq_r2m(mm5,*(dataptr+13)); // write result 3
869 // last 4x4 done
871 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
873 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
874 movq_r2r(mm0, mm6);
876 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
877 movq_r2r(mm2, mm7);
879 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
880 movq_r2r(mm0, mm4);
883 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
884 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
886 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
887 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
889 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
890 movq_r2r(mm1, mm2); // copy first line
892 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
893 movq_r2r(mm6, mm5); // copy first intermediate result
895 movq_r2m(mm0, *(dataptr+8)); // write result 1
896 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
898 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
899 movq_r2r(mm3, mm0); // copy third line
901 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
903 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
904 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
906 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
907 movq_r2r(mm1, mm4);
909 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
910 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
912 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
913 movq_r2r(mm2, mm6);
915 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
916 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
918 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
919 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
921 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
922 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
924 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
926 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
928 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
931 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
933 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
934 movq_r2r(mm0, mm2);
936 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
937 movq_r2r(mm7, mm4);
939 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
940 movq_r2r(mm0, mm1);
942 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
943 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
945 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
946 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
948 movq_r2r(mm0, mm7); // write result 1
949 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
951 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
952 movq_r2r(mm1, mm6); // write result 2
954 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
955 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
957 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
958 movq_r2r(mm2, mm3); // copy first intermediate result
960 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
961 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
963 movq_r2m(mm7, tmp7);
964 movq_r2r(mm2, mm5); // write result 3
966 movq_r2m(mm6, tmp6);
967 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
969 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
970 movq_r2r(mm3, mm4); // write result 4
972 /************************************************************************************************
973 End of Transpose
974 ************************************************************************************************/
977 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
978 movq_r2r(mm0, mm7);
980 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
981 movq_r2r(mm1, mm6);
983 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
984 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
986 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
987 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
989 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
990 paddw_r2r(mm7, mm6); // tmp12 + tmp13
992 /* stage 3 */
994 movq_m2r(tmp6, mm2);
995 movq_r2r(mm0, mm3);
997 psllw_i2r(2, mm6); // m8 * 2^2
998 paddw_r2r(mm1, mm0);
1000 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1001 psubw_r2r(mm1, mm3);
1003 movq_r2m(mm0, *dataptr);
1004 movq_r2r(mm7, mm0);
1006 /* Odd part */
1007 movq_r2m(mm3, *(dataptr+8));
1008 paddw_r2r(mm5, mm4); // tmp10
1010 movq_m2r(tmp7, mm3);
1011 paddw_r2r(mm6, mm0); // tmp32
1013 paddw_r2r(mm2, mm5); // tmp11
1014 psubw_r2r(mm6, mm7); // tmp33
1016 movq_r2m(mm0, *(dataptr+4));
1017 paddw_r2r(mm3, mm2); // tmp12
1019 /* stage 4 */
1021 movq_r2m(mm7, *(dataptr+12));
1022 movq_r2r(mm4, mm1); // copy of tmp10
1024 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1025 psllw_i2r(2, mm4); // m8 * 2^2
1027 movq_m2r(RTjpeg_C2mC6, mm0);
1028 psllw_i2r(2, mm1);
1030 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1031 psllw_i2r(2, mm2);
1033 pmulhw_r2r(mm0, mm4); // z5
1035 /* stage 5 */
1037 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1038 psllw_i2r(2, mm5);
1040 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1041 movq_r2r(mm3, mm0); // copy tmp7
1043 movq_m2r(*(dataptr+1), mm7);
1044 paddw_r2r(mm1, mm4); // z2
1046 paddw_r2r(mm1, mm2); // z4
1048 paddw_r2r(mm5, mm0); // z11
1049 psubw_r2r(mm5, mm3); // z13
1051 /* stage 6 */
1053 movq_r2r(mm3, mm5); // copy z13
1054 psubw_r2r(mm4, mm3); // y3=z13 - z2
1056 paddw_r2r(mm4, mm5); // y5=z13 + z2
1057 movq_r2r(mm0, mm6); // copy z11
1059 movq_r2m(mm3, *(dataptr+6)); //save y3
1060 psubw_r2r(mm2, mm0); // y7=z11 - z4
1062 movq_r2m(mm5, *(dataptr+10)); //save y5
1063 paddw_r2r(mm2, mm6); // y1=z11 + z4
1065 movq_r2m(mm0, *(dataptr+14)); //save y7
1067 /************************************************
1068 * End of 1st 4 rows
1069 ************************************************/
1071 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1072 movq_r2r(mm7, mm0); // copy x0
1074 movq_r2m(mm6, *(dataptr+2)); //save y1
1076 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1077 movq_r2r(mm1, mm6); // copy x1
1079 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1081 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1082 movq_r2r(mm2, mm5); // copy x2
1084 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1085 movq_r2r(mm3, mm4); // copy x3
1087 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1089 movq_r2m(mm7, tmp7); // save tmp07
1090 movq_r2r(mm0, mm7); // copy tmp00
1092 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1094 /* stage 2, Even Part */
1096 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1098 movq_r2m(mm6, tmp6); // save tmp07
1099 movq_r2r(mm1, mm6); // copy tmp01
1101 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1102 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1104 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1106 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1107 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1109 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1111 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1112 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1114 /* stage 3, Even and stage 4 & 5 even */
1116 movq_m2r(tmp6, mm2); // load tmp6
1117 movq_r2r(mm0, mm3); // copy tmp10
1119 psllw_i2r(2, mm6); // shift z1
1120 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1122 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1123 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1125 movq_r2m(mm0, *(dataptr+1)); //save y0
1126 movq_r2r(mm7, mm0); // copy tmp13
1128 /* odd part */
1130 movq_r2m(mm3, *(dataptr+9)); //save y4
1131 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1133 movq_m2r(tmp7, mm3); // load tmp7
1134 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1136 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1137 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1139 movq_r2m(mm0, *(dataptr+5)); //save y2
1140 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1142 /* stage 4 */
1144 movq_r2m(mm7, *(dataptr+13)); //save y6
1145 movq_r2r(mm4, mm1); // copy tmp10
1147 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1148 psllw_i2r(2, mm4); // shift tmp10
1150 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1151 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1153 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1154 psllw_i2r(2, mm5); // prepare for multiply
1156 pmulhw_r2r(mm0, mm4); // multiply by converted real
1158 /* stage 5 */
1160 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1161 psllw_i2r(2, mm2); // prepare for multiply
1163 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1164 movq_r2r(mm3, mm0); // copy tmp7
1166 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1167 paddw_r2r(mm1, mm4); // z2
1169 paddw_r2r(mm5, mm0); // z11
1170 psubw_r2r(mm5, mm3); // z13
1172 /* stage 6 */
1174 movq_r2r(mm3, mm5); // copy z13
1175 paddw_r2r(mm1, mm2); // z4
1177 movq_r2r(mm0, mm6); // copy z11
1178 psubw_r2r(mm4, mm5); // y3
1180 paddw_r2r(mm2, mm6); // y1
1181 paddw_r2r(mm4, mm3); // y5
1183 movq_r2m(mm5, *(dataptr+7)); //save y3
1185 movq_r2m(mm6, *(dataptr+3)); //save y1
1186 psubw_r2r(mm2, mm0); // y7
1188 /************************************************************************************************
1189 Start of Transpose
1190 ************************************************************************************************/
1192 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1193 movq_r2r(mm7, mm5); // copy first line
1195 punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1196 movq_r2r(mm6, mm2); // copy third line
1198 punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1199 movq_r2r(mm7, mm1); // copy first intermediate result
1201 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1203 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1205 movq_r2m(mm7, *(dataptr+9)); // write result 1
1206 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1208 movq_r2m(mm1, *(dataptr+11)); // write result 2
1209 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1211 movq_r2r(mm5, mm1); // copy first intermediate result
1212 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1214 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1215 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1217 movq_r2m(mm5, *(dataptr+13)); // write result 3
1219 /****** last 4x4 done */
1221 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1223 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1224 movq_r2r(mm0, mm6); // copy first line
1226 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1227 movq_r2r(mm2, mm7); // copy third line
1229 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1230 movq_r2r(mm0, mm4); // copy first intermediate result
1234 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1235 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1237 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1238 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1240 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1241 movq_r2r(mm1, mm2); // copy first line
1243 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1244 movq_r2r(mm6, mm5); // copy first intermediate result
1246 movq_r2m(mm0, *(dataptr+8)); // write result 1
1247 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1249 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1250 movq_r2r(mm3, mm0); // copy third line
1252 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1254 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1255 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1257 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
1258 movq_r2r(mm1, mm4); // copy second intermediate result
1260 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1261 punpckldq_r2r(mm3, mm1); //
1263 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1264 movq_r2r(mm2, mm6); // copy second intermediate result
1266 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1267 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1269 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1270 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1272 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1273 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1275 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1277 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1279 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1281 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1283 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1284 movq_r2r(mm0, mm2); // copy first line
1286 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1287 movq_r2r(mm7, mm4); // copy third line
1289 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1290 movq_r2r(mm0, mm1); // copy first intermediate result
1292 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1293 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1295 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1296 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1298 movq_r2r(mm0, mm7); // write result 1
1299 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1301 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1302 movq_r2r(mm1, mm6); // write result 2
1304 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1305 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1307 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1308 movq_r2r(mm2, mm3); // copy first intermediate result
1310 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1311 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1313 movq_r2m(mm7, tmp7); // save tmp07
1314 movq_r2r(mm2, mm5); // write result 3
1316 movq_r2m(mm6, tmp6); // save tmp06
1318 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1320 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1321 movq_r2r(mm3, mm4); // write result 4
1323 /************************************************************************************************
1324 End of Transpose 2
1325 ************************************************************************************************/
1327 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1328 movq_r2r(mm0, mm7);
1330 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1331 movq_r2r(mm1, mm6);
1333 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1334 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1336 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1337 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1339 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1340 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1342 /* stage 3 */
1344 movq_m2r(tmp6, mm2);
1345 movq_r2r(mm0, mm3);
1347 psllw_i2r(2, mm6); // m8 * 2^2
1348 paddw_r2r(mm1, mm0);
1350 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1351 psubw_r2r(mm1, mm3);
1353 movq_r2m(mm0, *dataptr);
1354 movq_r2r(mm7, mm0);
1356 /* Odd part */
1357 movq_r2m(mm3, *(dataptr+8));
1358 paddw_r2r(mm5, mm4); // tmp10
1360 movq_m2r(tmp7, mm3);
1361 paddw_r2r(mm6, mm0); // tmp32
1363 paddw_r2r(mm2, mm5); // tmp11
1364 psubw_r2r(mm6, mm7); // tmp33
1366 movq_r2m(mm0, *(dataptr+4));
1367 paddw_r2r(mm3, mm2); // tmp12
1369 /* stage 4 */
1370 movq_r2m(mm7, *(dataptr+12));
1371 movq_r2r(mm4, mm1); // copy of tmp10
1373 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1374 psllw_i2r(2, mm4); // m8 * 2^2
1376 movq_m2r(RTjpeg_C2mC6, mm0);
1377 psllw_i2r(2, mm1);
1379 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1380 psllw_i2r(2, mm2);
1382 pmulhw_r2r(mm0, mm4); // z5
1384 /* stage 5 */
1386 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1387 psllw_i2r(2, mm5);
1389 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1390 movq_r2r(mm3, mm0); // copy tmp7
1392 movq_m2r(*(dataptr+1), mm7);
1393 paddw_r2r(mm1, mm4); // z2
1395 paddw_r2r(mm1, mm2); // z4
1397 paddw_r2r(mm5, mm0); // z11
1398 psubw_r2r(mm5, mm3); // z13
1400 /* stage 6 */
1402 movq_r2r(mm3, mm5); // copy z13
1403 psubw_r2r(mm4, mm3); // y3=z13 - z2
1405 paddw_r2r(mm4, mm5); // y5=z13 + z2
1406 movq_r2r(mm0, mm6); // copy z11
1408 movq_r2m(mm3, *(dataptr+6)); //save y3
1409 psubw_r2r(mm2, mm0); // y7=z11 - z4
1411 movq_r2m(mm5, *(dataptr+10)); //save y5
1412 paddw_r2r(mm2, mm6); // y1=z11 + z4
1414 movq_r2m(mm0, *(dataptr+14)); //save y7
1416 /************************************************
1417 * End of 1st 4 rows
1418 ************************************************/
1420 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1421 movq_r2r(mm7, mm0); // copy x0
1423 movq_r2m(mm6, *(dataptr+2)); //save y1
1425 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1426 movq_r2r(mm1, mm6); // copy x1
1428 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1430 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1431 movq_r2r(mm2, mm5); // copy x2
1433 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1434 movq_r2r(mm3, mm4); // copy x3
1436 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1438 movq_r2m(mm7, tmp7); // save tmp07
1439 movq_r2r(mm0, mm7); // copy tmp00
1441 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1443 /* stage 2, Even Part */
1445 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1447 movq_r2m(mm6, tmp6); // save tmp06
1448 movq_r2r(mm1, mm6); // copy tmp01
1450 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1451 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1453 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1455 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1456 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1458 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1460 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1461 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1463 /* stage 3, Even and stage 4 & 5 even */
1465 movq_m2r(tmp6, mm2); // load tmp6
1466 movq_r2r(mm0, mm3); // copy tmp10
1468 psllw_i2r(2, mm6); // shift z1
1469 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1471 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1472 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1474 movq_r2m(mm0, *(dataptr+1)); //save y0
1475 movq_r2r(mm7, mm0); // copy tmp13
1477 /* odd part */
1479 movq_r2m(mm3, *(dataptr+9)); //save y4
1480 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1482 movq_m2r(tmp7, mm3); // load tmp7
1483 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1485 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1486 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1488 movq_r2m(mm0, *(dataptr+5)); //save y2
1489 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1491 /* stage 4 */
1493 movq_r2m(mm7, *(dataptr+13)); //save y6
1494 movq_r2r(mm4, mm1); // copy tmp10
1496 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1497 psllw_i2r(2, mm4); // shift tmp10
1499 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1500 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1502 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1503 psllw_i2r(2, mm5); // prepare for multiply
1505 pmulhw_r2r(mm0, mm4); // multiply by converted real
1507 /* stage 5 */
1509 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1510 psllw_i2r(2, mm2); // prepare for multiply
1512 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1513 movq_r2r(mm3, mm0); // copy tmp7
1515 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1516 paddw_r2r(mm1, mm4); // z2
1518 paddw_r2r(mm5, mm0); // z11
1519 psubw_r2r(mm5, mm3); // z13
1521 /* stage 6 */
1523 movq_r2r(mm3, mm5); // copy z13
1524 paddw_r2r(mm1, mm2); // z4
1526 movq_r2r(mm0, mm6); // copy z11
1527 psubw_r2r(mm4, mm5); // y3
1529 paddw_r2r(mm2, mm6); // y1
1530 paddw_r2r(mm4, mm3); // y5
1532 movq_r2m(mm5, *(dataptr+7)); //save y3
1533 psubw_r2r(mm2, mm0); // y7=z11 - z4
1535 movq_r2m(mm3, *(dataptr+11)); //save y5
1537 movq_r2m(mm6, *(dataptr+3)); //save y1
1539 movq_r2m(mm0, *(dataptr+15)); //save y7
1542 #endif
/*
 * Fixed-point constants for the inverse DCT.  Each value is the named
 * real number scaled by 256 (8 fractional bits), matching the ">> 8"
 * performed by MULTIPLY() below.
 */
#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */

/* Divide by 8 with rounding (add 4, shift right by 3) and narrow to __s16. */
#define DESCALE(x) ((__s16)(((x) + 4) >> 3))

/*
 * Clip yuv to 16..235 (should be 16..240 for cr/cb but ...).
 *
 * The whole expansion is parenthesized so that RL() can be used safely
 * inside a larger expression (e.g. "2 * RL(v)" or "RL(v) + 1"); without the
 * outer parentheses the surrounding operators would bind to parts of the
 * conditional expression.  Note that the argument is evaluated more than
 * once, so it must not have side effects.
 */
#define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x)))

/*
 * Multiply a value by one of the FIX_* constants above and scale the
 * product back down by 256, rounding to nearest (add 128, shift right by 8).
 * The product is converted to __s32 before the rounding term is added.
 */
#define MULTIPLY(var, factor) (((__s32)((var) * (factor)) + 128) >> 8)
1557 void RTjpeg_idct_init(void)
1559 int i;
1561 for(i=0; i<64; i++)
1563 RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
1564 RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
1568 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
1570 #if HAVE_MMX
1572 static mmx_t fix_141 = {0x5a825a825a825a82LL};
1573 static mmx_t fix_184n261 = {0xcf04cf04cf04cf04LL};
1574 static mmx_t fix_184 = {0x7641764176417641LL};
1575 static mmx_t fix_n184 = {0x896f896f896f896fLL};
1576 static mmx_t fix_108n184 = {0xcf04cf04cf04cf04LL};
1578 mmx_t workspace[64];
1579 mmx_t *wsptr = workspace;
1580 register mmx_t *dataptr = (mmx_t *)odata;
1581 mmx_t *idata = (mmx_t *)data;
1583 rskip = rskip>>3;
1585 * Perform inverse DCT on one block of coefficients.
1588 /* Odd part */
1590 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1592 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1594 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1596 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1598 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1600 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1602 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1604 psllw_i2r(2, mm2); // shift z10
1605 movq_r2r(mm2, mm0); // copy z10
1607 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1608 movq_r2r(mm3, mm5); // copy tmp4
1610 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1611 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1613 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1614 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1616 psubw_r2r(mm1, mm6); // z11-z13
1617 psllw_i2r(2, mm5); // shift z12
1619 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1620 movq_r2r(mm5, mm7); // copy z12
1622 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1623 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1625 //ok
1627 /* Even part */
1628 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1629 psllw_i2r(2, mm6);
1631 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1633 paddw_r2r(mm5, mm0); // tmp10
1635 paddw_r2r(mm7, mm2); // tmp12
1637 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1638 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1640 movq_r2r(mm1, mm5); // copy tmp1
1641 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1643 psubw_r2r(mm4, mm5); // tmp1-tmp3
1644 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1646 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1647 psllw_i2r(2, mm5); // shift tmp1-tmp3
1649 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1651 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1652 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1654 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1656 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1658 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1659 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1661 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1662 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1664 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1665 movq_r2r(mm1, mm5); // copy tmp11
1667 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1668 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1670 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1672 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1673 movq_r2r(mm7, mm0); // copy tmp0
1675 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1676 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1678 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1680 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1681 movq_r2r(mm1, mm3); // copy tmp1
1683 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1684 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1686 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1688 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1689 movq_r2r(mm4, mm1); // copy tmp3
1691 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1693 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1695 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1697 movq_r2m(mm4, *(wsptr+8));
1698 movq_r2r(mm5, mm7); // copy tmp2
1700 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1702 movq_r2m(mm1, *(wsptr+6));
1703 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1705 movq_r2m(mm5, *(wsptr+4));
1707 movq_r2m(mm7, *(wsptr+10));
1709 //ok
1712 /*****************************************************************/
1714 idata++;
1715 wsptr++;
1717 /*****************************************************************/
1719 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1721 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1723 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1724 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1726 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1727 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1729 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1731 psllw_i2r(2, mm2); // shift z10
1732 movq_r2r(mm2, mm0); // copy z10
1734 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1735 movq_r2r(mm3, mm5); // copy tmp4
1737 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1738 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1740 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1741 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1743 psubw_r2r(mm1, mm6); // z11-z13
1744 psllw_i2r(2, mm5); // shift z12
1746 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1747 movq_r2r(mm5, mm7); // copy z12
1749 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1750 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1752 //ok
1754 /* Even part */
1755 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1756 psllw_i2r(2, mm6);
1758 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1760 paddw_r2r(mm5, mm0); // tmp10
1762 paddw_r2r(mm7, mm2); // tmp12
1764 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1765 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1767 movq_r2r(mm1, mm5); // copy tmp1
1768 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1770 psubw_r2r(mm4, mm5); // tmp1-tmp3
1771 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1773 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1774 psllw_i2r(2, mm5); // shift tmp1-tmp3
1776 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1777 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1779 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1781 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1783 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1785 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1786 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1788 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1789 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1791 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1792 movq_r2r(mm1, mm5); // copy tmp11
1794 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1795 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1797 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1799 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1800 movq_r2r(mm7, mm0); // copy tmp0
1802 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1803 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1805 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1807 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1808 movq_r2r(mm1, mm3); // copy tmp1
1810 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1811 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1813 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1815 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1816 movq_r2r(mm4, mm1); // copy tmp3
1818 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1820 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1822 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1824 movq_r2m(mm4, *(wsptr+8));
1825 movq_r2r(mm5, mm7); // copy tmp2
1827 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1829 movq_r2m(mm1, *(wsptr+6));
1830 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1832 movq_r2m(mm5, *(wsptr+4));
1834 movq_r2m(mm7, *(wsptr+10));
1836 /*****************************************************************/
1838 /* Pass 2: process rows from work array, store into output array. */
1839 /* Note that we must descale the results by a factor of 8 == 2**3, */
1840 /* and also undo the PASS1_BITS scaling. */
1842 /*****************************************************************/
1843 /* Even part */
1845 wsptr--;
1847 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1848 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1849 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1850 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1851 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1853 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1854 movq_r2r(mm0, mm2);
1856 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1857 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1859 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1860 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1862 movq_r2r(mm0, mm6);
1863 movq_r2r(mm3, mm5);
1865 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1866 movq_r2r(mm2, mm1);
1868 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1869 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1871 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1872 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1874 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1875 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1877 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1878 movq_r2r(mm3, mm4);
1880 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1881 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1883 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1884 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1887 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1888 movq_r2r(mm6, mm2);
1890 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1891 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1893 movq_r2r(mm3, mm5);
1894 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1896 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1897 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1899 movq_r2r(mm4, mm7);
1900 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1902 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1904 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1906 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1907 movq_r2r(mm1, mm6);
1909 //ok
1911 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1912 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1915 movq_r2r(mm0, mm2);
1916 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1918 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1919 psllw_i2r(2, mm6);
1921 pmulhw_m2r(fix_141, mm6);
1922 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1924 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1925 movq_r2r(mm0, mm7);
1927 // tmp0 = tmp10 + tmp13;
1928 // tmp3 = tmp10 - tmp13;
1929 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1930 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1932 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1933 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1934 // tmp1 = tmp11 + tmp12;
1935 // tmp2 = tmp11 - tmp12;
1936 movq_r2r(mm1, mm5);
1938 //OK
1940 /* Odd part */
1942 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1943 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1944 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1945 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1946 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1947 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1949 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1950 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1952 movq_r2r(mm3, mm6);
1953 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1955 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1956 movq_r2r(mm3, mm2);
1958 //Save tmp0 and tmp1 in wsptr
1959 movq_r2m(mm0, *(wsptr)); // save tmp0
1960 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1963 //Continue with z10 --- z13
1964 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1965 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1967 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1968 movq_r2r(mm6, mm4);
1970 movq_r2m(mm1, *(wsptr+1)); // save tmp1
1971 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1973 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1974 movq_r2r(mm6, mm1);
1976 //Save tmp2 and tmp3 in wsptr
1977 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1978 movq_r2r(mm2, mm4);
1980 //Continue with z10 --- z13
1981 movq_r2m(mm5, *(wsptr+2)); // save tmp2
1982 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1984 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1985 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1987 movq_r2r(mm3, mm0);
1988 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1990 movq_r2m(mm7, *(wsptr+3)); // save tmp3
1991 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1993 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1994 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1996 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1997 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1999 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2000 movq_r2r(mm6, mm4);
2002 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2003 movq_r2r(mm1, mm5);
2005 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2006 movq_r2r(mm6, mm2);
2008 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2009 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2011 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2012 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2014 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2015 movq_r2r(mm1, mm7);
2017 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2018 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2020 movq_r2r(mm6, mm5);
2021 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2023 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2024 movq_r2r(mm2, mm4);
2026 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2028 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2030 punpckhdq_r2r(mm6, mm4); /// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2032 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2033 movq_r2r(mm0, mm5);
2035 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2037 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2038 movq_r2r(mm3, mm4);
2040 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2041 movq_r2r(mm5, mm1);
2043 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2044 // tmp7 = z11 + z13; /* phase 5 */
2045 // tmp8 = z11 - z13; /* phase 5 */
2046 psubw_r2r(mm4, mm1); // tmp8
2048 paddw_r2r(mm4, mm5); // tmp7
2049 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2050 psllw_i2r(2, mm1);
2052 psllw_i2r(2, mm0);
2054 pmulhw_m2r(fix_141, mm1); // tmp21
2055 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2056 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2057 psllw_i2r(2, mm3);
2058 movq_r2r(mm0, mm7);
2060 pmulhw_m2r(fix_n184, mm7);
2061 movq_r2r(mm3, mm6);
2063 movq_m2r(*(wsptr), mm2); // tmp0,final1
2065 pmulhw_m2r(fix_108n184, mm6);
2066 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2067 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2068 movq_r2r(mm2, mm4); // final1
2070 pmulhw_m2r(fix_184n261, mm0);
2071 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2073 pmulhw_m2r(fix_184, mm3);
2074 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2076 // tmp6 = tmp22 - tmp7; /* phase 2 */
2077 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2079 paddw_r2r(mm6, mm7); // tmp20
2080 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2082 paddw_r2r(mm0, mm3); // tmp22
2084 // tmp5 = tmp21 - tmp6;
2085 psubw_r2r(mm5, mm3); // tmp6
2087 // tmp4 = tmp20 + tmp5;
2088 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2089 psubw_r2r(mm3, mm1); // tmp5
2091 movq_r2r(mm0, mm6); // final2
2092 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2094 /* Final output stage: scale down by a factor of 8 and range-limit */
2097 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2098 // & RANGE_MASK];
2099 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2100 // & RANGE_MASK]; final1
2103 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2104 // & RANGE_MASK];
2105 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2106 // & RANGE_MASK]; final2
2107 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2108 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2110 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2112 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2114 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2115 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2117 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2118 // & RANGE_MASK];
2119 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2120 // & RANGE_MASK]; final3
2121 paddw_r2r(mm1, mm7); // tmp4
2122 movq_r2r(mm5, mm3);
2124 paddw_r2r(mm1, mm5); // tmp2+tmp5
2125 psubw_r2r(mm1, mm3); // tmp2-tmp5
2127 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2129 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2130 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2134 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2135 // & RANGE_MASK];
2136 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2137 // & RANGE_MASK]; final4
2138 movq_r2r(mm4, mm6);
2139 paddw_r2r(mm7, mm4); // tmp3+tmp4
2141 psubw_r2r(mm7, mm6); // tmp3-tmp4
2142 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2144 // mov ecx, [dataptr]
2146 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2148 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2150 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2151 movq_r2r(mm2, mm4);
2153 movq_r2r(mm5, mm7);
2154 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2156 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2157 movq_r2r(mm2, mm1);
2159 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2161 // add dataptr, 4
2163 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2165 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2167 // add ecx, output_col
2169 movq_r2r(mm7, mm6);
2170 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2172 movq_r2r(mm2, mm0);
2173 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2175 // mov idata, [dataptr]
2177 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2179 // add dataptr, 4
2181 movq_r2r(mm1, mm3);
2183 // add idata, output_col
2185 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2187 movq_r2m(mm2, *(dataptr));
2189 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2191 dataptr += rskip;
2192 movq_r2m(mm0, *(dataptr));
2194 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2195 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2197 dataptr += rskip;
2198 movq_r2m(mm1, *(dataptr));
2200 dataptr += rskip;
2201 movq_r2m(mm3, *(dataptr));
2203 /*******************************************************************/
2205 wsptr += 8;
2207 /*******************************************************************/
2209 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2210 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2211 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2212 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2213 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2215 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2216 movq_r2r(mm0, mm2);
2218 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2219 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2221 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2222 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2224 movq_r2r(mm0, mm6);
2225 movq_r2r(mm3, mm5);
2227 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2228 movq_r2r(mm2, mm1);
2230 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2231 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2233 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2234 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2236 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2237 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2239 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2240 movq_r2r(mm3, mm4);
2242 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2243 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2245 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2246 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2248 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2249 movq_r2r(mm6, mm2);
2251 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2252 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2254 movq_r2r(mm3, mm5);
2255 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2257 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2258 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2260 movq_r2r(mm4, mm7);
2261 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2263 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2265 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2267 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2268 movq_r2r(mm1, mm6);
2270 //OK
2272 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2273 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2275 movq_r2r(mm0, mm2);
2276 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2278 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2279 psllw_i2r(2, mm6);
2281 pmulhw_m2r(fix_141, mm6);
2282 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2284 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2285 movq_r2r(mm0, mm7);
2287 // tmp0 = tmp10 + tmp13;
2288 // tmp3 = tmp10 - tmp13;
2289 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2290 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2292 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2293 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2294 // tmp1 = tmp11 + tmp12;
2295 // tmp2 = tmp11 - tmp12;
2296 movq_r2r(mm1, mm5);
2298 //OK
2301 /* Odd part */
2303 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2304 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2305 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2306 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2307 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2308 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2310 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2311 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2313 movq_r2r(mm3, mm6);
2314 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2316 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2317 movq_r2r(mm3, mm2);
2319 //Save tmp0 and tmp1 in wsptr
2320 movq_r2m(mm0, *(wsptr)); // save tmp0
2321 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2324 //Continue with z10 --- z13
2325 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2326 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2328 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2329 movq_r2r(mm6, mm4);
2331 movq_r2m(mm1, *(wsptr+1)); // save tmp1
2332 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2334 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2335 movq_r2r(mm6, mm1);
2337 //Save tmp2 and tmp3 in wsptr
2338 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2339 movq_r2r(mm2, mm4);
2341 //Continue with z10 --- z13
2342 movq_r2m(mm5, *(wsptr+2)); // save tmp2
2343 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2345 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2346 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2348 movq_r2r(mm3, mm0);
2349 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2351 movq_r2m(mm7, *(wsptr+3)); // save tmp3
2352 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2354 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2355 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2357 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2358 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2360 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2361 movq_r2r(mm6, mm4);
2363 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2364 movq_r2r(mm1, mm5);
2366 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2367 movq_r2r(mm6, mm2);
2369 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2370 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2372 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2373 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2375 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2376 movq_r2r(mm1, mm7);
2378 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2379 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2381 movq_r2r(mm6, mm5);
2382 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2384 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2385 movq_r2r(mm2, mm4);
2387 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2389 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2391 punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2393 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2394 movq_r2r(mm0, mm5);
2396 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2398 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2399 movq_r2r(mm3, mm4);
2401 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2402 movq_r2r(mm5, mm1);
2404 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2405 // tmp7 = z11 + z13; /* phase 5 */
2406 // tmp8 = z11 - z13; /* phase 5 */
2407 psubw_r2r(mm4, mm1); // tmp8
2409 paddw_r2r(mm4, mm5); // tmp7
2410 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2411 psllw_i2r(2, mm1);
2413 psllw_i2r(2, mm0);
2415 pmulhw_m2r(fix_141, mm1); // tmp21
2416 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2417 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2418 psllw_i2r(2, mm3);
2419 movq_r2r(mm0, mm7);
2421 pmulhw_m2r(fix_n184, mm7);
2422 movq_r2r(mm3, mm6);
2424 movq_m2r(*(wsptr), mm2); // tmp0,final1
2426 pmulhw_m2r(fix_108n184, mm6);
2427 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2428 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2429 movq_r2r(mm2, mm4); // final1
2431 pmulhw_m2r(fix_184n261, mm0);
2432 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2434 pmulhw_m2r(fix_184, mm3);
2435 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2437 // tmp6 = tmp22 - tmp7; /* phase 2 */
2438 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2440 paddw_r2r(mm6, mm7); // tmp20
2441 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2443 paddw_r2r(mm0, mm3); // tmp22
2445 // tmp5 = tmp21 - tmp6;
2446 psubw_r2r(mm5, mm3); // tmp6
2448 // tmp4 = tmp20 + tmp5;
2449 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2450 psubw_r2r(mm3, mm1); // tmp5
2452 movq_r2r(mm0, mm6); // final2
2453 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2455 /* Final output stage: scale down by a factor of 8 and range-limit */
2457 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2458 // & RANGE_MASK];
2459 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2460 // & RANGE_MASK]; final1
2463 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2464 // & RANGE_MASK];
2465 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2466 // & RANGE_MASK]; final2
2467 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2468 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2470 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2472 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2474 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2475 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2477 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2478 // & RANGE_MASK];
2479 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2480 // & RANGE_MASK]; final3
2481 paddw_r2r(mm1, mm7); // tmp4
2482 movq_r2r(mm5, mm3);
2484 paddw_r2r(mm1, mm5); // tmp2+tmp5
2485 psubw_r2r(mm1, mm3); // tmp2-tmp5
2487 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2489 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2490 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2494 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2495 // & RANGE_MASK];
2496 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2497 // & RANGE_MASK]; final4
2498 movq_r2r(mm4, mm6);
2499 paddw_r2r(mm7, mm4); // tmp3+tmp4
2501 psubw_r2r(mm7, mm6); // tmp3-tmp4
2502 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2504 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2507 movq_r2m(mm4, *dummy);
2508 fprintf(stderr, "3-4 %016llx\n", dummy);
2509 movq_r2m(mm4, *dummy);
2510 fprintf(stderr, "3+4 %016llx\n", dummy);
2514 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2516 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2517 movq_r2r(mm2, mm4);
2519 movq_r2r(mm5, mm7);
2520 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2522 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2523 movq_r2r(mm2, mm1);
2525 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2527 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2529 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2531 movq_r2r(mm7, mm6);
2532 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2534 movq_r2r(mm2, mm0);
2535 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2537 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2539 movq_r2r(mm1, mm3);
2541 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2543 dataptr += rskip;
2544 movq_r2m(mm2, *(dataptr));
2546 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2548 dataptr += rskip;
2549 movq_r2m(mm0, *(dataptr));
2551 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2553 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2555 dataptr += rskip;
2556 movq_r2m(mm1, *(dataptr));
2558 dataptr += rskip;
2559 movq_r2m(mm3, *(dataptr));
2561 #else
2562 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2563 __s32 tmp10, tmp11, tmp12, tmp13;
2564 __s32 z5, z10, z11, z12, z13;
2565 __s16 *inptr;
2566 __s32 *wsptr;
2567 __u8 *outptr;
2568 int ctr;
2569 __s32 dcval;
2570 __s32 workspace[64];
2572 inptr = data;
2573 wsptr = workspace;
2574 for (ctr = 8; ctr > 0; ctr--) {
2576 if ((inptr[8] | inptr[16] | inptr[24] |
2577 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2578 dcval = inptr[0];
2579 wsptr[0] = dcval;
2580 wsptr[8] = dcval;
2581 wsptr[16] = dcval;
2582 wsptr[24] = dcval;
2583 wsptr[32] = dcval;
2584 wsptr[40] = dcval;
2585 wsptr[48] = dcval;
2586 wsptr[56] = dcval;
2588 inptr++;
2589 wsptr++;
2590 continue;
2593 tmp0 = inptr[0];
2594 tmp1 = inptr[16];
2595 tmp2 = inptr[32];
2596 tmp3 = inptr[48];
2598 tmp10 = tmp0 + tmp2;
2599 tmp11 = tmp0 - tmp2;
2601 tmp13 = tmp1 + tmp3;
2602 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2604 tmp0 = tmp10 + tmp13;
2605 tmp3 = tmp10 - tmp13;
2606 tmp1 = tmp11 + tmp12;
2607 tmp2 = tmp11 - tmp12;
2609 tmp4 = inptr[8];
2610 tmp5 = inptr[24];
2611 tmp6 = inptr[40];
2612 tmp7 = inptr[56];
2614 z13 = tmp6 + tmp5;
2615 z10 = tmp6 - tmp5;
2616 z11 = tmp4 + tmp7;
2617 z12 = tmp4 - tmp7;
2619 tmp7 = z11 + z13;
2620 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2622 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2623 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2624 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2626 tmp6 = tmp12 - tmp7;
2627 tmp5 = tmp11 - tmp6;
2628 tmp4 = tmp10 + tmp5;
2630 wsptr[0] = (__s32) (tmp0 + tmp7);
2631 wsptr[56] = (__s32) (tmp0 - tmp7);
2632 wsptr[8] = (__s32) (tmp1 + tmp6);
2633 wsptr[48] = (__s32) (tmp1 - tmp6);
2634 wsptr[16] = (__s32) (tmp2 + tmp5);
2635 wsptr[40] = (__s32) (tmp2 - tmp5);
2636 wsptr[32] = (__s32) (tmp3 + tmp4);
2637 wsptr[24] = (__s32) (tmp3 - tmp4);
2639 inptr++;
2640 wsptr++;
2643 wsptr = workspace;
2644 for (ctr = 0; ctr < 8; ctr++) {
2645 outptr = &(odata[ctr*rskip]);
2647 tmp10 = wsptr[0] + wsptr[4];
2648 tmp11 = wsptr[0] - wsptr[4];
2650 tmp13 = wsptr[2] + wsptr[6];
2651 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2653 tmp0 = tmp10 + tmp13;
2654 tmp3 = tmp10 - tmp13;
2655 tmp1 = tmp11 + tmp12;
2656 tmp2 = tmp11 - tmp12;
2658 z13 = wsptr[5] + wsptr[3];
2659 z10 = wsptr[5] - wsptr[3];
2660 z11 = wsptr[1] + wsptr[7];
2661 z12 = wsptr[1] - wsptr[7];
2663 tmp7 = z11 + z13;
2664 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2666 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2667 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2668 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2670 tmp6 = tmp12 - tmp7;
2671 tmp5 = tmp11 - tmp6;
2672 tmp4 = tmp10 + tmp5;
2674 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2675 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2676 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2677 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2678 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2679 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2680 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2681 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2683 wsptr += 8;
2685 #endif
2689 Main Routines
2691 This file contains most of the initialisation and control functions
2693 (C) Justin Schoeman 1998
2699 Private function
2701 Initialise all the cache-aligned data blocks
2705 void RTjpeg_init_data(void)
2707 unsigned long dptr;
2709 dptr=(unsigned long)&(RTjpeg_alldata[0]);
2710 dptr+=32;
2711 dptr=dptr>>5;
2712 dptr=dptr<<5; /* cache align data */
2714 RTjpeg_block=(__s16 *)dptr;
2715 dptr+=sizeof(__s16)*64;
2716 RTjpeg_lqt=(__s32 *)dptr;
2717 dptr+=sizeof(__s32)*64;
2718 RTjpeg_cqt=(__s32 *)dptr;
2719 dptr+=sizeof(__s32)*64;
2720 RTjpeg_liqt=(__u32 *)dptr;
2721 dptr+=sizeof(__u32)*64;
2722 RTjpeg_ciqt=(__u32 *)dptr;
2727 External Function
2729 Re-set quality factor
2731 Input: Q -> quality factor (192=best, 32=worst)
2736 void RTjpeg_init_Q(__u8 Q)
2738 int i;
2739 __u64 qual;
2741 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2743 for(i=0; i<64; i++)
2745 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2746 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
2747 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2748 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
2749 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
2750 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
2751 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
2752 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
2755 RTjpeg_lb8=0;
2756 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2757 RTjpeg_lb8--;
2758 RTjpeg_cb8=0;
2759 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2760 RTjpeg_cb8--;
2762 RTjpeg_dct_init();
2763 RTjpeg_idct_init();
2764 RTjpeg_quant_init();
2769 External Function
2771 Initialise compression.
2773 Input: buf -> pointer to 128 ints for quant values store to pass back to
2774 init_decompress.
2775 width -> width of image
2776 height -> height of image
2777 Q -> quality factor (192=best, 32=worst)
2781 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
2783 int i;
2784 __u64 qual;
2786 RTjpeg_init_data();
2788 RTjpeg_width=width;
2789 RTjpeg_height=height;
2790 RTjpeg_Ywidth = RTjpeg_width>>3;
2791 RTjpeg_Ysize=width * height;
2792 RTjpeg_Cwidth = RTjpeg_width>>4;
2793 RTjpeg_Csize= (width>>1) * height;
2795 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2797 for(i=0; i<64; i++)
2799 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2800 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
2801 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2802 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
2803 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
2804 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
2805 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
2806 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
2809 RTjpeg_lb8=0;
2810 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2811 RTjpeg_lb8--;
2812 RTjpeg_cb8=0;
2813 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2814 RTjpeg_cb8--;
2816 RTjpeg_dct_init();
2817 RTjpeg_quant_init();
2819 for(i=0; i<64; i++)
2820 buf[i]=le2me_32(RTjpeg_liqt[i]);
2821 for(i=0; i<64; i++)
2822 buf[64+i]=le2me_32(RTjpeg_ciqt[i]);
2825 void RTjpeg_init_decompress(__u32 *buf, int width, int height)
2827 int i;
2829 RTjpeg_init_data();
2831 RTjpeg_width=width;
2832 RTjpeg_height=height;
2833 RTjpeg_Ywidth = RTjpeg_width>>3;
2834 RTjpeg_Ysize=width * height;
2835 RTjpeg_Cwidth = RTjpeg_width>>4;
2836 RTjpeg_Csize= (width>>1) * height;
2838 for(i=0; i<64; i++)
2840 RTjpeg_liqt[i]=le2me_32(buf[i]);
2841 RTjpeg_ciqt[i]=le2me_32(buf[i+64]);
2844 RTjpeg_lb8=0;
2845 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2846 RTjpeg_lb8--;
2847 RTjpeg_cb8=0;
2848 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2849 RTjpeg_cb8--;
2851 RTjpeg_idct_init();
2853 // RTjpeg_color_init();
2856 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
2858 __s8 * sb;
2859 register __s8 * bp1 = bp + (RTjpeg_width<<3);
2860 register __s8 * bp2 = bp + RTjpeg_Ysize;
2861 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
2862 register int i, j, k;
2864 #if HAVE_MMX
2865 emms();
2866 #endif
2867 sb=sp;
2868 /* Y */
2869 for(i=RTjpeg_height>>1; i; i-=8)
2871 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2873 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2874 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2875 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2877 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2878 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2879 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2881 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
2882 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2883 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2885 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
2886 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2887 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2889 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2890 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2891 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2893 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2894 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2895 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2898 bp+=RTjpeg_width<<4;
2899 bp1+=RTjpeg_width<<4;
2900 bp2+=RTjpeg_width<<2;
2901 bp3+=RTjpeg_width<<2;
2904 #if HAVE_MMX
2905 emms();
2906 #endif
2907 return (sp-sb);
2910 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
2912 __s8 * sb;
2913 register __s8 * bp2 = bp + RTjpeg_Ysize;
2914 register __s8 * bp3 = bp2 + RTjpeg_Csize;
2915 register int i, j, k;
2917 #if HAVE_MMX
2918 emms();
2919 #endif
2920 sb=sp;
2921 /* Y */
2922 for(i=RTjpeg_height; i; i-=8)
2924 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2926 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2927 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2928 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2930 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2931 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2932 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2934 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2935 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2936 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2938 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2939 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2940 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2943 bp+=RTjpeg_width<<3;
2944 bp2+=RTjpeg_width<<2;
2945 bp3+=RTjpeg_width<<2;
2948 #if HAVE_MMX
2949 emms();
2950 #endif
2951 return (sp-sb);
2954 int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
2956 __s8 * sb;
2957 int i, j;
2959 #if HAVE_MMX
2960 emms();
2961 #endif
2963 sb=sp;
2964 /* Y */
2965 for(i=0; i<RTjpeg_height; i+=8)
2967 for(j=0; j<RTjpeg_width; j+=8)
2969 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
2970 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2971 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2973 bp+=RTjpeg_width;
2976 #if HAVE_MMX
2977 emms();
2978 #endif
2979 return (sp-sb);
2982 void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
2984 register __s8 * bp2 = bp + RTjpeg_Ysize;
2985 register __s8 * bp3 = bp2 + (RTjpeg_Csize);
2986 int i, j,k;
2988 #if HAVE_MMX
2989 emms();
2990 #endif
2992 /* Y */
2993 for(i=RTjpeg_height; i; i-=8)
2995 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
2996 if(*sp==-1)sp++;
2997 else
2999 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3000 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
3002 if(*sp==-1)sp++;
3003 else
3005 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3006 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
3008 if(*sp==-1)sp++;
3009 else
3011 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
3012 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
3014 if(*sp==-1)sp++;
3015 else
3017 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
3018 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
3021 bp+=RTjpeg_width<<3;
3022 bp2+=RTjpeg_width<<2;
3023 bp3+=RTjpeg_width<<2;
3025 #if HAVE_MMX
3026 emms();
3027 #endif
3030 void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
3032 register __s8 * bp1 = bp + (RTjpeg_width<<3);
3033 register __s8 * bp2 = bp + RTjpeg_Ysize;
3034 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
3035 int i, j,k;
3037 #if HAVE_MMX
3038 emms();
3039 #endif
3041 /* Y */
3042 for(i=RTjpeg_height>>1; i; i-=8)
3044 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
3045 if(*sp==-1)sp++;
3046 else
3048 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3049 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
3051 if(*sp==-1)sp++;
3052 else
3054 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3055 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
3057 if(*sp==-1)sp++;
3058 else
3060 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3061 RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
3063 if(*sp==-1)sp++;
3064 else
3066 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3067 RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
3069 if(*sp==-1)sp++;
3070 else
3072 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
3073 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
3075 if(*sp==-1)sp++;
3076 else
3078 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
3079 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
3082 bp+=RTjpeg_width<<4;
3083 bp1+=RTjpeg_width<<4;
3084 bp2+=RTjpeg_width<<2;
3085 bp3+=RTjpeg_width<<2;
3087 #if HAVE_MMX
3088 emms();
3089 #endif
3092 void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
3094 int i, j;
3096 #if HAVE_MMX
3097 emms();
3098 #endif
3100 /* Y */
3101 for(i=0; i<RTjpeg_height; i+=8)
3103 for(j=0; j<RTjpeg_width; j+=8)
3104 if(*sp==-1)sp++;
3105 else
3107 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3108 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
3110 bp+=RTjpeg_width<<3;
3115 External Function
3117 Initialise additional data structures for motion compensation
3121 void RTjpeg_init_mcompress(void)
3123 unsigned long tmp;
3125 if(!RTjpeg_old)
3127 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
3128 tmp=(unsigned long)RTjpeg_old;
3129 tmp+=32;
3130 tmp=tmp>>5;
3131 RTjpeg_old=(__s16 *)(tmp<<5);
3133 if (!RTjpeg_old)
3135 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
3136 exit(-1);
3138 memset(RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height)));
3141 #if HAVE_MMX
3143 int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
3145 int i;
3146 mmx_t *mold=(mmx_t *)old;
3147 mmx_t *mblock=(mmx_t *)RTjpeg_block;
3148 volatile mmx_t result;
3149 static mmx_t neg={0xffffffffffffffffULL};
3151 movq_m2r(*mask, mm7);
3152 movq_m2r(neg, mm6);
3153 pxor_r2r(mm5, mm5);
3155 for(i=0; i<8; i++)
3157 movq_m2r(*(mblock++), mm0);
3158 movq_m2r(*(mblock++), mm2);
3159 movq_m2r(*(mold++), mm1);
3160 movq_m2r(*(mold++), mm3);
3161 psubsw_r2r(mm1, mm0);
3162 psubsw_r2r(mm3, mm2);
3163 movq_r2r(mm0, mm1);
3164 movq_r2r(mm2, mm3);
3165 pcmpgtw_r2r(mm7, mm0);
3166 pcmpgtw_r2r(mm7, mm2);
3167 pxor_r2r(mm6, mm1);
3168 pxor_r2r(mm6, mm3);
3169 pcmpgtw_r2r(mm7, mm1);
3170 pcmpgtw_r2r(mm7, mm3);
3171 por_r2r(mm0, mm5);
3172 por_r2r(mm2, mm5);
3173 por_r2r(mm1, mm5);
3174 por_r2r(mm3, mm5);
3176 movq_r2m(mm5, result);
3178 if(result.q)
3180 // if(!RTjpeg_mtest)
3181 // for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
3182 return 0;
3184 // printf(".");
3185 return 1;
3188 #else
3189 int RTjpeg_bcomp(__s16 *old, __u16 *mask)
3191 int i;
3193 for(i=0; i<64; i++)
3194 if(abs(old[i]-RTjpeg_block[i])>*mask)
3196 if(!RTjpeg_mtest)
3197 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
3198 return 0;
3200 return 1;
3202 #endif
3204 void RTjpeg_set_test(int i)
3206 RTjpeg_mtest=i;
3209 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
3211 __s8 * sb;
3212 //rh __s16 *block;
3213 register __s8 * bp1 = bp + (RTjpeg_width<<3);
3214 register __s8 * bp2 = bp + RTjpeg_Ysize;
3215 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
3216 register int i, j, k;
3218 #if HAVE_MMX
3219 emms();
3220 RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask;
3221 RTjpeg_cmask.uq=((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask;
3222 #else
3223 RTjpeg_lmask=lmask;
3224 RTjpeg_cmask=cmask;
3225 #endif
3227 sb=sp;
3228 block=RTjpeg_old;
3229 /* Y */
3230 for(i=RTjpeg_height>>1; i; i-=8)
3232 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
3234 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
3235 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3236 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3238 *((__u8 *)sp++)=255;
3240 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3241 block+=64;
3243 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
3244 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3245 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3247 *((__u8 *)sp++)=255;
3249 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3250 block+=64;
3252 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
3253 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3254 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3256 *((__u8 *)sp++)=255;
3258 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3259 block+=64;
3261 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
3262 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3263 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3265 *((__u8 *)sp++)=255;
3267 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3268 block+=64;
3270 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
3271 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3272 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3274 *((__u8 *)sp++)=255;
3276 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3277 block+=64;
3279 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
3280 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3281 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3283 *((__u8 *)sp++)=255;
3285 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3286 block+=64;
3288 bp+=RTjpeg_width<<4;
3289 bp1+=RTjpeg_width<<4;
3290 bp2+=RTjpeg_width<<2;
3291 bp3+=RTjpeg_width<<2;
3294 #if HAVE_MMX
3295 emms();
3296 #endif
3297 return (sp-sb);
3301 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
3303 __s8 * sb;
3304 __s16 *block;
3305 register __s8 * bp2;
3306 register __s8 * bp3;
3307 register int i, j, k;
3309 #if HAVE_MMX
3310 emms();
3311 RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask;
3312 RTjpeg_cmask.uq=((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask;
3313 #else
3314 RTjpeg_lmask=lmask;
3315 RTjpeg_cmask=cmask;
3316 #endif
3318 bp = bp - RTjpeg_width*0;
3319 bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0;
3320 bp3 = bp2 + RTjpeg_Csize;
3322 sb=sp;
3323 block=RTjpeg_old;
3324 /* Y */
3325 for(i=RTjpeg_height; i; i-=8)
3327 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
3329 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
3330 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3331 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3333 *((__u8 *)sp++)=255;
3335 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3336 block+=64;
3338 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
3339 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3340 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3342 *((__u8 *)sp++)=255;
3344 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3345 block+=64;
3347 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
3348 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3349 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3351 *((__u8 *)sp++)=255;
3353 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3354 block+=64;
3356 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
3357 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3358 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3360 *((__u8 *)sp++)=255;
3362 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3363 block+=64;
3366 bp+=RTjpeg_width<<3;
3367 bp2+=RTjpeg_width<<2;
3368 bp3+=RTjpeg_width<<2;
3370 printf ("%d\n", block - RTjpeg_old);
3371 #if HAVE_MMX
3372 emms();
3373 #endif
3374 return (sp-sb);
3377 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
3379 __s8 * sb;
3380 __s16 *block;
3381 int i, j;
3383 #if HAVE_MMX
3384 emms();
3385 RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask;
3386 #else
3387 RTjpeg_lmask=lmask;
3388 #endif
3391 sb=sp;
3392 block=RTjpeg_old;
3393 /* Y */
3394 for(i=0; i<RTjpeg_height; i+=8)
3396 for(j=0; j<RTjpeg_width; j+=8)
3398 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
3399 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3400 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3402 *((__u8 *)sp++)=255;
3403 // printf("* %d ", sp[-1]);
3404 } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3405 block+=64;
3407 bp+=RTjpeg_width<<3;
3409 #if HAVE_MMX
3410 emms();
3411 #endif
3412 return (sp-sb);
/* Colour-conversion initialisation hook; currently does nothing. */
void RTjpeg_color_init(void)
{
}

/*
 * Coefficients for the YUV -> RGB converters below.  Every product is
 * shifted right by 16 after use, i.e. these are scaled by 65536.
 *
 * NOTE(review): KcrR has the same value as Ky; the Cr->R factor usually
 * paired with the other values here would be about 1.596 * 65536 --
 * confirm whether 76284 is intended.
 */
#define KcrR 76284
#define KcrG 53281
#define KcbG 25625
#define KcbB 132252
#define Ky 76284
3425 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride)
3427 int tmp;
3428 int i, j;
3429 __s32 y, crR, crG, cbG, cbB;
3430 __u8 *bufcr, *bufcb, *bufy, *bufoute;
3431 int yskip;
3433 yskip=RTjpeg_width;
3435 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3436 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3437 bufy=&buf[0];
3438 bufoute=rgb;
3440 for(i=0; i<(RTjpeg_height); i++)
3442 for(j=0; j<RTjpeg_width; j+=2)
3444 crR=(*bufcr-128)*KcrR;
3445 crG=(*(bufcr++)-128)*KcrG;
3446 cbG=(*bufcb-128)*KcbG;
3447 cbB=(*(bufcb++)-128)*KcbB;
3449 y=(bufy[j]-16)*Ky;
3451 tmp=(y+crR)>>16;
3452 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3453 tmp=(y-crG-cbG)>>16;
3454 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3455 tmp=(y+cbB)>>16;
3456 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3458 y=(bufy[j+1]-16)*Ky;
3460 tmp=(y+crR)>>16;
3461 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3462 tmp=(y-crG-cbG)>>16;
3463 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3464 tmp=(y+cbB)>>16;
3465 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3468 bufy+=yskip;
3473 void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride)
3475 int tmp;
3476 int i, j;
3477 __s32 y, crR, crG, cbG, cbB;
3478 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3479 int oskip, yskip;
3481 if(stride==0)
3482 oskip=RTjpeg_width*3;
3483 else
3484 oskip=2*stride-RTjpeg_width*3;
3486 yskip=RTjpeg_width;
3488 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3489 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3490 bufy=&buf[0];
3491 bufoute=rgb;
3492 bufouto=rgb+RTjpeg_width*3;
3494 for(i=0; i<(RTjpeg_height>>1); i++)
3496 for(j=0; j<RTjpeg_width; j+=2)
3498 crR=(*bufcr-128)*KcrR;
3499 crG=(*(bufcr++)-128)*KcrG;
3500 cbG=(*bufcb-128)*KcbG;
3501 cbB=(*(bufcb++)-128)*KcbB;
3503 y=(bufy[j]-16)*Ky;
3505 tmp=(y+crR)>>16;
3506 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3507 tmp=(y-crG-cbG)>>16;
3508 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3509 tmp=(y+cbB)>>16;
3510 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3512 y=(bufy[j+1]-16)*Ky;
3514 tmp=(y+crR)>>16;
3515 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3516 tmp=(y-crG-cbG)>>16;
3517 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3518 tmp=(y+cbB)>>16;
3519 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3521 y=(bufy[j+yskip]-16)*Ky;
3523 tmp=(y+crR)>>16;
3524 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3525 tmp=(y-crG-cbG)>>16;
3526 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3527 tmp=(y+cbB)>>16;
3528 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3530 y=(bufy[j+1+yskip]-16)*Ky;
3532 tmp=(y+crR)>>16;
3533 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3534 tmp=(y-crG-cbG)>>16;
3535 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3536 tmp=(y+cbB)>>16;
3537 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3540 bufoute+=oskip;
3541 bufouto+=oskip;
3542 bufy+=yskip<<1;
3547 void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride)
3549 int tmp;
3550 int i, j;
3551 __s32 y, crR, crG, cbG, cbB;
3552 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3553 int oskip, yskip;
3555 if(stride==0)
3556 oskip=RTjpeg_width*4;
3557 else
3558 oskip = 2*stride-RTjpeg_width*4;
3559 yskip=RTjpeg_width;
3561 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3562 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3563 bufy=&buf[0];
3564 bufoute=rgb;
3565 bufouto=rgb+RTjpeg_width*4;
3567 for(i=0; i<(RTjpeg_height>>1); i++)
3569 for(j=0; j<RTjpeg_width; j+=2)
3571 crR=(*bufcr-128)*KcrR;
3572 crG=(*(bufcr++)-128)*KcrG;
3573 cbG=(*bufcb-128)*KcbG;
3574 cbB=(*(bufcb++)-128)*KcbB;
3576 y=(bufy[j]-16)*Ky;
3578 tmp=(y+cbB)>>16;
3579 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3580 tmp=(y-crG-cbG)>>16;
3581 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3582 tmp=(y+crR)>>16;
3583 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3584 bufoute++;
3586 y=(bufy[j+1]-16)*Ky;
3588 tmp=(y+cbB)>>16;
3589 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3590 tmp=(y-crG-cbG)>>16;
3591 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3592 tmp=(y+crR)>>16;
3593 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3594 bufoute++;
3596 y=(bufy[j+yskip]-16)*Ky;
3598 tmp=(y+cbB)>>16;
3599 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3600 tmp=(y-crG-cbG)>>16;
3601 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3602 tmp=(y+crR)>>16;
3603 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3604 bufouto++;
3606 y=(bufy[j+1+yskip]-16)*Ky;
3608 tmp=(y+cbB)>>16;
3609 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3610 tmp=(y-crG-cbG)>>16;
3611 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3612 tmp=(y+crR)>>16;
3613 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3614 bufouto++;
3617 bufoute+=oskip;
3618 bufouto+=oskip;
3619 bufy+=yskip<<1;
3623 void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride)
3625 int tmp;
3626 int i, j;
3627 __s32 y, crR, crG, cbG, cbB;
3628 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3629 int oskip, yskip;
3631 if(stride==0)
3632 oskip=RTjpeg_width*3;
3633 else
3634 oskip=2*stride - RTjpeg_width*3;
3636 yskip=RTjpeg_width;
3638 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3639 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3640 bufy=&buf[0];
3641 bufoute=rgb;
3642 bufouto=rgb+RTjpeg_width*3;
3644 for(i=0; i<(RTjpeg_height>>1); i++)
3646 for(j=0; j<RTjpeg_width; j+=2)
3648 crR=(*bufcr-128)*KcrR;
3649 crG=(*(bufcr++)-128)*KcrG;
3650 cbG=(*bufcb-128)*KcbG;
3651 cbB=(*(bufcb++)-128)*KcbB;
3653 y=(bufy[j]-16)*Ky;
3655 tmp=(y+cbB)>>16;
3656 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3657 tmp=(y-crG-cbG)>>16;
3658 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3659 tmp=(y+crR)>>16;
3660 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3662 y=(bufy[j+1]-16)*Ky;
3664 tmp=(y+cbB)>>16;
3665 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3666 tmp=(y-crG-cbG)>>16;
3667 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3668 tmp=(y+crR)>>16;
3669 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3671 y=(bufy[j+yskip]-16)*Ky;
3673 tmp=(y+cbB)>>16;
3674 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3675 tmp=(y-crG-cbG)>>16;
3676 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3677 tmp=(y+crR)>>16;
3678 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3680 y=(bufy[j+1+yskip]-16)*Ky;
3682 tmp=(y+cbB)>>16;
3683 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3684 tmp=(y-crG-cbG)>>16;
3685 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3686 tmp=(y+crR)>>16;
3687 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3690 bufoute+=oskip;
3691 bufouto+=oskip;
3692 bufy+=yskip<<1;
3696 void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride)
3698 int tmp;
3699 int i, j;
3700 __s32 y, crR, crG, cbG, cbB;
3701 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3702 int oskip, yskip;
3703 unsigned char r, g, b;
3705 if(stride==0)
3706 oskip=RTjpeg_width*2;
3707 else
3708 oskip=2*stride-RTjpeg_width*2;
3710 yskip=RTjpeg_width;
3712 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3713 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3714 bufy=&buf[0];
3715 bufoute=rgb;
3716 bufouto=rgb+RTjpeg_width*2;
3718 for(i=0; i<(RTjpeg_height>>1); i++)
3720 for(j=0; j<RTjpeg_width; j+=2)
3722 crR=(*bufcr-128)*KcrR;
3723 crG=(*(bufcr++)-128)*KcrG;
3724 cbG=(*bufcb-128)*KcbG;
3725 cbB=(*(bufcb++)-128)*KcbB;
3727 y=(bufy[j]-16)*Ky;
3729 tmp=(y+cbB)>>16;
3730 b=(tmp>255)?255:((tmp<0)?0:tmp);
3731 tmp=(y-crG-cbG)>>16;
3732 g=(tmp>255)?255:((tmp<0)?0:tmp);
3733 tmp=(y+crR)>>16;
3734 r=(tmp>255)?255:((tmp<0)?0:tmp);
3735 tmp=(int)((int)b >> 3);
3736 tmp|=(int)(((int)g >> 2) << 5);
3737 tmp|=(int)(((int)r >> 3) << 11);
3738 *(bufoute++)=tmp&0xff;
3739 *(bufoute++)=tmp>>8;
3742 y=(bufy[j+1]-16)*Ky;
3744 tmp=(y+cbB)>>16;
3745 b=(tmp>255)?255:((tmp<0)?0:tmp);
3746 tmp=(y-crG-cbG)>>16;
3747 g=(tmp>255)?255:((tmp<0)?0:tmp);
3748 tmp=(y+crR)>>16;
3749 r=(tmp>255)?255:((tmp<0)?0:tmp);
3750 tmp=(int)((int)b >> 3);
3751 tmp|=(int)(((int)g >> 2) << 5);
3752 tmp|=(int)(((int)r >> 3) << 11);
3753 *(bufoute++)=tmp&0xff;
3754 *(bufoute++)=tmp>>8;
3756 y=(bufy[j+yskip]-16)*Ky;
3758 tmp=(y+cbB)>>16;
3759 b=(tmp>255)?255:((tmp<0)?0:tmp);
3760 tmp=(y-crG-cbG)>>16;
3761 g=(tmp>255)?255:((tmp<0)?0:tmp);
3762 tmp=(y+crR)>>16;
3763 r=(tmp>255)?255:((tmp<0)?0:tmp);
3764 tmp=(int)((int)b >> 3);
3765 tmp|=(int)(((int)g >> 2) << 5);
3766 tmp|=(int)(((int)r >> 3) << 11);
3767 *(bufouto++)=tmp&0xff;
3768 *(bufouto++)=tmp>>8;
3770 y=(bufy[j+1+yskip]-16)*Ky;
3772 tmp=(y+cbB)>>16;
3773 b=(tmp>255)?255:((tmp<0)?0:tmp);
3774 tmp=(y-crG-cbG)>>16;
3775 g=(tmp>255)?255:((tmp<0)?0:tmp);
3776 tmp=(y+crR)>>16;
3777 r=(tmp>255)?255:((tmp<0)?0:tmp);
3778 tmp=(int)((int)b >> 3);
3779 tmp|=(int)(((int)g >> 2) << 5);
3780 tmp|=(int)(((int)r >> 3) << 11);
3781 *(bufouto++)=tmp&0xff;
3782 *(bufouto++)=tmp>>8;
3785 bufoute+=oskip;
3786 bufouto+=oskip;
3787 bufy+=yskip<<1;
3791 /* fix stride */
3793 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride)
3795 memcpy(rgb, buf, RTjpeg_width*RTjpeg_height);