2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
38 #define BETTERCOMPRESSION 1
40 static const unsigned char RTjpeg_ZZ
[64]={
46 40, 33, 26, 19, 12, 5,
47 6, 13, 20, 27, 34, 41, 48,
48 56, 49, 42, 35, 28, 21, 14, 7,
49 15, 22, 29, 36, 43, 50, 57,
50 58, 51, 44, 37, 30, 23,
57 static const __u64 RTjpeg_aan_tab
[64]={
58 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
59 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
60 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
61 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
62 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
63 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
64 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
65 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
69 static __s32 RTjpeg_ws
[64+31];
71 __u8 RTjpeg_alldata
[2*64+4*64+4*64+4*64+4*64+32];
73 static __s16
*block
; // rh
74 static __s16
*RTjpeg_block
;
75 static __s32
*RTjpeg_lqt
;
76 static __s32
*RTjpeg_cqt
;
77 static __u32
*RTjpeg_liqt
;
78 static __u32
*RTjpeg_ciqt
;
80 static unsigned char RTjpeg_lb8
;
81 static unsigned char RTjpeg_cb8
;
82 static int RTjpeg_width
, RTjpeg_height
;
83 static int RTjpeg_Ywidth
, RTjpeg_Cwidth
;
84 static int RTjpeg_Ysize
, RTjpeg_Csize
;
86 static __s16
*RTjpeg_old
=NULL
;
97 static const unsigned char RTjpeg_lum_quant_tbl
[64] = {
98 16, 11, 10, 16, 24, 40, 51, 61,
99 12, 12, 14, 19, 26, 58, 60, 55,
100 14, 13, 16, 24, 40, 57, 69, 56,
101 14, 17, 22, 29, 51, 87, 80, 62,
102 18, 22, 37, 56, 68, 109, 103, 77,
103 24, 35, 55, 64, 81, 104, 113, 92,
104 49, 64, 78, 87, 103, 121, 120, 101,
105 72, 92, 95, 98, 112, 100, 103, 99
108 static const unsigned char RTjpeg_chrom_quant_tbl
[64] = {
109 17, 18, 24, 47, 99, 99, 99, 99,
110 18, 21, 26, 66, 99, 99, 99, 99,
111 24, 26, 56, 99, 99, 99, 99, 99,
112 47, 66, 99, 99, 99, 99, 99, 99,
113 99, 99, 99, 99, 99, 99, 99, 99,
114 99, 99, 99, 99, 99, 99, 99, 99,
115 99, 99, 99, 99, 99, 99, 99, 99,
116 99, 99, 99, 99, 99, 99, 99, 99
119 #ifdef BETTERCOMPRESSION
121 /*--------------------------------------------------*/
122 /* better encoding, but needs a lot more cpu time */
123 /* seems to be more effective than old method +lzo */
124 /* with this encoding lzo isn't efficient anymore */
125 /* there is still more potential for better */
126 /* encoding but that would need even more cputime */
127 /* anyway your mileage may vary */
129 /* written by Martin BIELY and Roman HOCHLEITNER */
130 /*--------------------------------------------------*/
132 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
133 /* Block to Stream (encoding) */
136 int RTjpeg_b2s(__s16
*data
, __s8
*strm
, __u8 bt8
)
138 register int ci
, co
=1;
139 register __s16 ZZvalue
;
140 register unsigned char bitten
;
141 register unsigned char bitoff
;
146 for (ii
=0; ii
< 64; ii
++) {
147 fprintf(stdout
, "%d ", data
[RTjpeg_ZZ
[ii
]]);
149 fprintf(stdout
, "\n\n");
158 // first byte allways written
160 (__u8
)(data
[RTjpeg_ZZ
[0]]>254) ? 254:((data
[RTjpeg_ZZ
[0]]<0)?0:data
[RTjpeg_ZZ
[0]]);
164 while (data
[RTjpeg_ZZ
[ci
]]==0 && ci
>0) ci
--;
166 bitten
= ((unsigned char)ci
) << 2;
169 ((__u8
*)strm
)[1]= bitten
;
174 /* bitoff=0 because the high 6bit contain first non zero position */
180 ZZvalue
= data
[RTjpeg_ZZ
[ci
]];
186 bitten
|= (0x01<<bitoff
);
189 bitten
|= (0x03<<bitoff
);
192 bitten
|= (0x02<<bitoff
);
198 ((__u8
*)strm
)[co
]= bitten
;
202 } /* "fall through" */
210 ((__u8
*)strm
)[co
]= bitten
;
218 /* correct bitoff to nibble boundaries */
227 ((__u8
*)strm
)[co
]= bitten
;
230 bitten
= 0; // clear half nibble values in bitten
238 ZZvalue
= data
[RTjpeg_ZZ
[ci
]];
240 if( (ZZvalue
> 7) || (ZZvalue
< -7) ) {
241 bitten
|= (0x08<<bitoff
);
245 bitten
|= (ZZvalue
&0xf)<<bitoff
;
248 ((__u8
*)strm
)[co
]= bitten
;
258 ((__u8
*)strm
)[co
]= bitten
;
265 ((__u8
*)strm
)[co
]= bitten
;
269 /* bitting is over now we bite */
272 ZZvalue
= data
[RTjpeg_ZZ
[ci
]];
276 strm
[co
++]=(__s8
)(ZZvalue
>127)?127:ZZvalue
;
280 strm
[co
++]=(__s8
)(ZZvalue
<-128)?-128:ZZvalue
;
287 /* we gotoo much now we are ill */
291 fprintf(stdout
, "\nco = '%d'\n", co
);
292 for (i
=0; i
< co
+2; i
++) {
293 fprintf(stdout
, "%d ", strm
[i
]);
295 fprintf(stdout
, "\n\n");
302 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
303 /* Stream to Block (decoding) */
306 int RTjpeg_s2b(__s16
*data
, __s8
*strm
, __u8 bt8
, __u32
*qtbl
)
311 register unsigned char bitten
;
312 register unsigned char bitoff
;
314 /* first byte always read */
316 data
[i
]=((__u8
)strm
[0])*qtbl
[i
];
318 /* we start at the behind */
320 bitten
= ((unsigned char)strm
[1]) >> 2;
322 for(; co
> bitten
; co
--) {
324 data
[RTjpeg_ZZ
[co
]] = 0;
333 /* we have to read the last 2 bits of the second byte */
339 bitten
= ((unsigned char)strm
[ci
]) >> bitoff
;
368 /* data is written properly */
370 /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
377 /* correct bitoff to nibble */
385 /* we have to read from the next byte */
395 bitten
= ((unsigned char)strm
[ci
]) >> bitoff
;
400 if( bitten
== 0x08 ) {
404 /* the compiler cannot do sign extension for signed nibbles */
405 if( bitten
& 0x08 ) {
408 /* the unsigned char bitten now is a valid signed char */
410 data
[i
]=((signed char)bitten
)*qtbl
[i
];
420 /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
430 data
[i
]=strm
[ci
++]*qtbl
[i
];
433 /* ci now is the count, because it points to next element => no incrementing */
438 fprintf(stdout
, "\nci = '%d'\n", ci
);
439 for (i
=0; i
< 64; i
++) {
440 fprintf(stdout
, "%d ", data
[RTjpeg_ZZ
[i
]]);
442 fprintf(stdout
, "\n\n");
450 int RTjpeg_b2s(__s16
*data
, __s8
*strm
, __u8 bt8
)
452 register int ci
, co
=1, tmp
;
453 register __s16 ZZvalue
;
458 for (ii
=0; ii
< 64; ii
++) {
459 fprintf(stdout
, "%d ", data
[RTjpeg_ZZ
[ii
]]);
461 fprintf(stdout
, "\n\n");
465 (__u8
)strm
[0]=(__u8
)(data
[RTjpeg_ZZ
[0]]>254) ? 254:((data
[RTjpeg_ZZ
[0]]<0)?0:data
[RTjpeg_ZZ
[0]]);
467 for(ci
=1; ci
<=bt8
; ci
++)
469 ZZvalue
= data
[RTjpeg_ZZ
[ci
]];
473 strm
[co
++]=(__s8
)(ZZvalue
>127)?127:ZZvalue
;
477 strm
[co
++]=(__s8
)(ZZvalue
<-128)?-128:ZZvalue
;
483 ZZvalue
= data
[RTjpeg_ZZ
[ci
]];
487 strm
[co
++]=(__s8
)(ZZvalue
>63)?63:ZZvalue
;
491 strm
[co
++]=(__s8
)(ZZvalue
<-64)?-64:ZZvalue
;
493 else /* compress zeros */
500 while((ci
<64)&&(data
[RTjpeg_ZZ
[ci
]]==0));
502 strm
[co
++]=(__s8
)(63+(ci
-tmp
));
509 int RTjpeg_s2b(__s16
*data
, __s8
*strm
, __u8 bt8
, __u32
*qtbl
)
515 data
[i
]=((__u8
)strm
[0])*qtbl
[i
];
517 for(co
=1; co
<=bt8
; co
++)
520 data
[i
]=strm
[ci
++]*qtbl
[i
];
528 for(; co
<tmp
; co
++)data
[RTjpeg_ZZ
[co
]]=0;
533 data
[i
]=strm
[ci
]*qtbl
[i
];
542 void RTjpeg_quant_init(void)
547 qtbl
=(__s16
*)RTjpeg_lqt
;
548 for(i
=0; i
<64; i
++)qtbl
[i
]=(__s16
)RTjpeg_lqt
[i
];
550 qtbl
=(__s16
*)RTjpeg_cqt
;
551 for(i
=0; i
<64; i
++)qtbl
[i
]=(__s16
)RTjpeg_cqt
[i
];
554 static mmx_t RTjpeg_ones
={0x0001000100010001LL
};
555 static mmx_t RTjpeg_half
={0x7fff7fff7fff7fffLL
};
557 void RTjpeg_quant(__s16
*block
, __s32
*qtbl
)
565 movq_m2r(RTjpeg_ones
, mm6
);
566 movq_m2r(RTjpeg_half
, mm7
);
570 movq_m2r(*(ql
++), mm0
); /* quant vals (4) */
571 movq_m2r(*bl
, mm2
); /* block vals (4) */
575 punpcklwd_r2r(mm6
, mm0
); /* 1 qb 1 qa */
576 punpckhwd_r2r(mm6
, mm1
); /* 1 qd 1 qc */
578 punpcklwd_r2r(mm7
, mm2
); /* 32767 bb 32767 ba */
579 punpckhwd_r2r(mm7
, mm3
); /* 32767 bd 32767 bc */
581 pmaddwd_r2r(mm2
, mm0
); /* 32767+bb*qb 32767+ba*qa */
582 pmaddwd_r2r(mm3
, mm1
); /* 32767+bd*qd 32767+bc*qc */
587 packssdw_r2r(mm1
, mm0
);
589 movq_r2m(mm0
, *(bl
++));
594 void RTjpeg_quant_init(void)
598 void RTjpeg_quant(__s16
*block
, __s32
*qtbl
)
603 block
[i
]=(__s16
)((block
[i
]*qtbl
[i
]+32767)>>16);
608 * Perform the forward DCT on one block of samples.
611 static mmx_t RTjpeg_C4
={0x2D412D412D412D41LL
};
612 static mmx_t RTjpeg_C6
={0x187E187E187E187ELL
};
613 static mmx_t RTjpeg_C2mC6
={0x22A322A322A322A3LL
};
614 static mmx_t RTjpeg_C2pC6
={0x539F539F539F539FLL
};
615 static mmx_t RTjpeg_zero
={0x0000000000000000LL
};
619 #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */
620 #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */
621 #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */
622 #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */
624 #define DESCALE10(x) (__s16)( ((x)+128) >> 8)
625 #define DESCALE20(x) (__s16)(((x)+32768) >> 16)
626 #define D_MULTIPLY(var,const) ((__s32) ((var) * (const)))
629 void RTjpeg_dct_init(void)
635 RTjpeg_lqt
[i
]=(((__u64
)RTjpeg_lqt
[i
]<<32)/RTjpeg_aan_tab
[i
]);
636 RTjpeg_cqt
[i
]=(((__u64
)RTjpeg_cqt
[i
]<<32)/RTjpeg_aan_tab
[i
]);
640 void RTjpeg_dctY(__u8
*idata
, __s16
*odata
, int rskip
)
643 __s32 tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
644 __s32 tmp10
, tmp11
, tmp12
, tmp13
;
645 __s32 z1
, z2
, z3
, z4
, z5
, z11
, z13
;
653 for (ctr
= 7; ctr
>= 0; ctr
--) {
654 tmp0
= idataptr
[0] + idataptr
[7];
655 tmp7
= idataptr
[0] - idataptr
[7];
656 tmp1
= idataptr
[1] + idataptr
[6];
657 tmp6
= idataptr
[1] - idataptr
[6];
658 tmp2
= idataptr
[2] + idataptr
[5];
659 tmp5
= idataptr
[2] - idataptr
[5];
660 tmp3
= idataptr
[3] + idataptr
[4];
661 tmp4
= idataptr
[3] - idataptr
[4];
663 tmp10
= (tmp0
+ tmp3
); /* phase 2 */
665 tmp11
= (tmp1
+ tmp2
);
668 wsptr
[0] = (tmp10
+ tmp11
)<<8; /* phase 3 */
669 wsptr
[4] = (tmp10
- tmp11
)<<8;
671 z1
= D_MULTIPLY(tmp12
+ tmp13
, FIX_0_707106781
); /* c4 */
672 wsptr
[2] = (tmp13
<<8) + z1
; /* phase 5 */
673 wsptr
[6] = (tmp13
<<8) - z1
;
675 tmp10
= tmp4
+ tmp5
; /* phase 2 */
679 z5
= D_MULTIPLY(tmp10
- tmp12
, FIX_0_382683433
); /* c6 */
680 z2
= D_MULTIPLY(tmp10
, FIX_0_541196100
) + z5
; /* c2-c6 */
681 z4
= D_MULTIPLY(tmp12
, FIX_1_306562965
) + z5
; /* c2+c6 */
682 z3
= D_MULTIPLY(tmp11
, FIX_0_707106781
); /* c4 */
684 z11
= (tmp7
<<8) + z3
; /* phase 5 */
685 z13
= (tmp7
<<8) - z3
;
687 wsptr
[5] = z13
+ z2
; /* phase 6 */
692 idataptr
+= rskip
<<3; /* advance pointer to next row */
698 for (ctr
= 7; ctr
>= 0; ctr
--) {
699 tmp0
= wsptr
[0] + wsptr
[56];
700 tmp7
= wsptr
[0] - wsptr
[56];
701 tmp1
= wsptr
[8] + wsptr
[48];
702 tmp6
= wsptr
[8] - wsptr
[48];
703 tmp2
= wsptr
[16] + wsptr
[40];
704 tmp5
= wsptr
[16] - wsptr
[40];
705 tmp3
= wsptr
[24] + wsptr
[32];
706 tmp4
= wsptr
[24] - wsptr
[32];
708 tmp10
= tmp0
+ tmp3
; /* phase 2 */
713 odataptr
[0] = DESCALE10(tmp10
+ tmp11
); /* phase 3 */
714 odataptr
[32] = DESCALE10(tmp10
- tmp11
);
716 z1
= D_MULTIPLY(tmp12
+ tmp13
, FIX_0_707106781
); /* c4 */
717 odataptr
[16] = DESCALE20((tmp13
<<8) + z1
); /* phase 5 */
718 odataptr
[48] = DESCALE20((tmp13
<<8) - z1
);
720 tmp10
= tmp4
+ tmp5
; /* phase 2 */
724 z5
= D_MULTIPLY(tmp10
- tmp12
, FIX_0_382683433
); /* c6 */
725 z2
= D_MULTIPLY(tmp10
, FIX_0_541196100
) + z5
; /* c2-c6 */
726 z4
= D_MULTIPLY(tmp12
, FIX_1_306562965
) + z5
; /* c2+c6 */
727 z3
= D_MULTIPLY(tmp11
, FIX_0_707106781
); /* c4 */
729 z11
= (tmp7
<<8) + z3
; /* phase 5 */
730 z13
= (tmp7
<<8) - z3
;
732 odataptr
[40] = DESCALE20(z13
+ z2
); /* phase 6 */
733 odataptr
[24] = DESCALE20(z13
- z2
);
734 odataptr
[8] = DESCALE20(z11
+ z4
);
735 odataptr
[56] = DESCALE20(z11
- z4
);
737 odataptr
++; /* advance pointer to next column */
741 volatile mmx_t tmp6
, tmp7
;
742 register mmx_t
*dataptr
= (mmx_t
*)odata
;
743 mmx_t
*idata2
= (mmx_t
*)idata
;
745 // first copy the input 8 bit to the destination 16 bits
747 movq_m2r(RTjpeg_zero
, mm2
);
750 movq_m2r(*idata2
, mm0
);
753 punpcklbw_r2r(mm2
, mm0
);
754 movq_r2m(mm0
, *(dataptr
));
756 punpckhbw_r2r(mm2
, mm1
);
757 movq_r2m(mm1
, *(dataptr
+1));
761 movq_m2r(*idata2
, mm0
);
764 punpcklbw_r2r(mm2
, mm0
);
765 movq_r2m(mm0
, *(dataptr
+2));
767 punpckhbw_r2r(mm2
, mm1
);
768 movq_r2m(mm1
, *(dataptr
+3));
772 movq_m2r(*idata2
, mm0
);
775 punpcklbw_r2r(mm2
, mm0
);
776 movq_r2m(mm0
, *(dataptr
+4));
778 punpckhbw_r2r(mm2
, mm1
);
779 movq_r2m(mm1
, *(dataptr
+5));
783 movq_m2r(*idata2
, mm0
);
786 punpcklbw_r2r(mm2
, mm0
);
787 movq_r2m(mm0
, *(dataptr
+6));
789 punpckhbw_r2r(mm2
, mm1
);
790 movq_r2m(mm1
, *(dataptr
+7));
794 movq_m2r(*idata2
, mm0
);
797 punpcklbw_r2r(mm2
, mm0
);
798 movq_r2m(mm0
, *(dataptr
+8));
800 punpckhbw_r2r(mm2
, mm1
);
801 movq_r2m(mm1
, *(dataptr
+9));
805 movq_m2r(*idata2
, mm0
);
808 punpcklbw_r2r(mm2
, mm0
);
809 movq_r2m(mm0
, *(dataptr
+10));
811 punpckhbw_r2r(mm2
, mm1
);
812 movq_r2m(mm1
, *(dataptr
+11));
816 movq_m2r(*idata2
, mm0
);
819 punpcklbw_r2r(mm2
, mm0
);
820 movq_r2m(mm0
, *(dataptr
+12));
822 punpckhbw_r2r(mm2
, mm1
);
823 movq_r2m(mm1
, *(dataptr
+13));
827 movq_m2r(*idata2
, mm0
);
830 punpcklbw_r2r(mm2
, mm0
);
831 movq_r2m(mm0
, *(dataptr
+14));
833 punpckhbw_r2r(mm2
, mm1
);
834 movq_r2m(mm1
, *(dataptr
+15));
836 /* Start Transpose to do calculations on rows */
838 movq_m2r(*(dataptr
+9), mm7
); // m03:m02|m01:m00 - first line (line 4)and copy into m5
840 movq_m2r(*(dataptr
+13), mm6
); // m23:m22|m21:m20 - third line (line 6)and copy into m2
843 punpcklwd_m2r(*(dataptr
+11), mm7
); // m11:m01|m10:m00 - interleave first and second lines
846 punpcklwd_m2r(*(dataptr
+15), mm6
); // m31:m21|m30:m20 - interleave third and fourth lines
849 movq_m2r(*(dataptr
+11), mm3
); // m13:m13|m11:m10 - second line
850 punpckldq_r2r(mm6
, mm7
); // m30:m20|m10:m00 - interleave to produce result 1
852 movq_m2r(*(dataptr
+15), mm0
); // m13:m13|m11:m10 - fourth line
853 punpckhdq_r2r(mm6
, mm1
); // m31:m21|m11:m01 - interleave to produce result 2
855 movq_r2m(mm7
,*(dataptr
+9)); // write result 1
856 punpckhwd_r2r(mm3
, mm5
); // m13:m03|m12:m02 - interleave first and second lines
858 movq_r2m(mm1
,*(dataptr
+11)); // write result 2
859 punpckhwd_r2r(mm0
, mm2
); // m33:m23|m32:m22 - interleave third and fourth lines
862 punpckldq_r2r(mm2
, mm5
); // m32:m22|m12:m02 - interleave to produce result 3
864 movq_m2r(*(dataptr
+1), mm0
); // m03:m02|m01:m00 - first line, 4x4
865 punpckhdq_r2r(mm2
, mm1
); // m33:m23|m13:m03 - interleave to produce result 4
867 movq_r2m(mm5
,*(dataptr
+13)); // write result 3
871 movq_r2m(mm1
, *(dataptr
+15)); // write result 4, last 4x4
873 movq_m2r(*(dataptr
+5), mm2
); // m23:m22|m21:m20 - third line
876 punpcklwd_m2r(*(dataptr
+3), mm0
); // m11:m01|m10:m00 - interleave first and second lines
879 punpcklwd_m2r(*(dataptr
+7), mm2
); // m31:m21|m30:m20 - interleave third and fourth lines
883 movq_m2r(*(dataptr
+8), mm1
); // n03:n02|n01:n00 - first line
884 punpckldq_r2r(mm2
, mm0
); // m30:m20|m10:m00 - interleave to produce first result
886 movq_m2r(*(dataptr
+12), mm3
); // n23:n22|n21:n20 - third line
887 punpckhdq_r2r(mm2
, mm4
); // m31:m21|m11:m01 - interleave to produce second result
889 punpckhwd_m2r(*(dataptr
+3), mm6
); // m13:m03|m12:m02 - interleave first and second lines
890 movq_r2r(mm1
, mm2
); // copy first line
892 punpckhwd_m2r(*(dataptr
+7), mm7
); // m33:m23|m32:m22 - interleave third and fourth lines
893 movq_r2r(mm6
, mm5
); // copy first intermediate result
895 movq_r2m(mm0
, *(dataptr
+8)); // write result 1
896 punpckhdq_r2r(mm7
, mm5
); // m33:m23|m13:m03 - produce third result
898 punpcklwd_m2r(*(dataptr
+10), mm1
); // n11:n01|n10:n00 - interleave first and second lines
899 movq_r2r(mm3
, mm0
); // copy third line
901 punpckhwd_m2r(*(dataptr
+10), mm2
); // n13:n03|n12:n02 - interleave first and second lines
903 movq_r2m(mm4
, *(dataptr
+10)); // write result 2 out
904 punpckldq_r2r(mm7
, mm6
); // m32:m22|m12:m02 - produce fourth result
906 punpcklwd_m2r(*(dataptr
+14), mm3
); // n31:n21|n30:n20 - interleave third and fourth lines
909 movq_r2m(mm6
, *(dataptr
+12)); // write result 3 out
910 punpckldq_r2r(mm3
, mm1
); // n30:n20|n10:n00 - produce first result
912 punpckhwd_m2r(*(dataptr
+14), mm0
); // n33:n23|n32:n22 - interleave third and fourth lines
915 movq_r2m(mm5
, *(dataptr
+14)); // write result 4 out
916 punpckhdq_r2r(mm3
, mm4
); // n31:n21|n11:n01- produce second result
918 movq_r2m(mm1
, *(dataptr
+1)); // write result 5 out - (first result for other 4 x 4 block)
919 punpckldq_r2r(mm0
, mm2
); // n32:n22|n12:n02- produce third result
921 movq_r2m(mm4
, *(dataptr
+3)); // write result 6 out
922 punpckhdq_r2r(mm0
, mm6
); // n33:n23|n13:n03 - produce fourth result
924 movq_r2m(mm2
, *(dataptr
+5)); // write result 7 out
926 movq_m2r(*dataptr
, mm0
); // m03:m02|m01:m00 - first line, first 4x4
928 movq_r2m(mm6
, *(dataptr
+7)); // write result 8 out
931 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
933 movq_m2r(*(dataptr
+4), mm7
); // m23:m22|m21:m20 - third line
936 punpcklwd_m2r(*(dataptr
+2), mm0
); // m11:m01|m10:m00 - interleave first and second lines
939 punpcklwd_m2r(*(dataptr
+6), mm7
); // m31:m21|m30:m20 - interleave third and fourth lines
942 movq_m2r(*(dataptr
+2), mm6
); // m13:m12|m11:m10 - second line
943 punpckldq_r2r(mm7
, mm0
); // m30:m20|m10:m00 - interleave to produce result 1
945 movq_m2r(*(dataptr
+6), mm5
); // m33:m32|m31:m30 - fourth line
946 punpckhdq_r2r(mm7
, mm1
); // m31:m21|m11:m01 - interleave to produce result 2
948 movq_r2r(mm0
, mm7
); // write result 1
949 punpckhwd_r2r(mm6
, mm2
); // m13:m03|m12:m02 - interleave first and second lines
951 psubw_m2r(*(dataptr
+14), mm7
); // tmp07=x0-x7 /* Stage 1 */
952 movq_r2r(mm1
, mm6
); // write result 2
954 paddw_m2r(*(dataptr
+14), mm0
); // tmp00=x0+x7 /* Stage 1 */
955 punpckhwd_r2r(mm5
, mm4
); // m33:m23|m32:m22 - interleave third and fourth lines
957 paddw_m2r(*(dataptr
+12), mm1
); // tmp01=x1+x6 /* Stage 1 */
958 movq_r2r(mm2
, mm3
); // copy first intermediate result
960 psubw_m2r(*(dataptr
+12), mm6
); // tmp06=x1-x6 /* Stage 1 */
961 punpckldq_r2r(mm4
, mm2
); // m32:m22|m12:m02 - interleave to produce result 3
964 movq_r2r(mm2
, mm5
); // write result 3
967 punpckhdq_r2r(mm4
, mm3
); // m33:m23|m13:m03 - interleave to produce result 4
969 paddw_m2r(*(dataptr
+10), mm2
); // tmp02=x2+5 /* Stage 1 */
970 movq_r2r(mm3
, mm4
); // write result 4
972 /************************************************************************************************
974 ************************************************************************************************/
977 paddw_m2r(*(dataptr
+8), mm3
); // tmp03=x3+x4 /* stage 1*/
980 psubw_m2r(*(dataptr
+8), mm4
); // tmp04=x3-x4 /* stage 1*/
983 paddw_r2r(mm3
, mm0
); // tmp10 = tmp00 + tmp03 /* even 2 */
984 psubw_r2r(mm3
, mm7
); // tmp13 = tmp00 - tmp03 /* even 2 */
986 psubw_r2r(mm2
, mm6
); // tmp12 = tmp01 - tmp02 /* even 2 */
987 paddw_r2r(mm2
, mm1
); // tmp11 = tmp01 + tmp02 /* even 2 */
989 psubw_m2r(*(dataptr
+10), mm5
); // tmp05=x2-x5 /* stage 1*/
990 paddw_r2r(mm7
, mm6
); // tmp12 + tmp13
997 psllw_i2r(2, mm6
); // m8 * 2^2
1000 pmulhw_m2r(RTjpeg_C4
, mm6
); // z1
1001 psubw_r2r(mm1
, mm3
);
1003 movq_r2m(mm0
, *dataptr
);
1007 movq_r2m(mm3
, *(dataptr
+8));
1008 paddw_r2r(mm5
, mm4
); // tmp10
1010 movq_m2r(tmp7
, mm3
);
1011 paddw_r2r(mm6
, mm0
); // tmp32
1013 paddw_r2r(mm2
, mm5
); // tmp11
1014 psubw_r2r(mm6
, mm7
); // tmp33
1016 movq_r2m(mm0
, *(dataptr
+4));
1017 paddw_r2r(mm3
, mm2
); // tmp12
1021 movq_r2m(mm7
, *(dataptr
+12));
1022 movq_r2r(mm4
, mm1
); // copy of tmp10
1024 psubw_r2r(mm2
, mm1
); // tmp10 - tmp12
1025 psllw_i2r(2, mm4
); // m8 * 2^2
1027 movq_m2r(RTjpeg_C2mC6
, mm0
);
1030 pmulhw_m2r(RTjpeg_C6
, mm1
); // z5
1033 pmulhw_r2r(mm0
, mm4
); // z5
1037 pmulhw_m2r(RTjpeg_C2pC6
, mm2
);
1040 pmulhw_m2r(RTjpeg_C4
, mm5
); // z3
1041 movq_r2r(mm3
, mm0
); // copy tmp7
1043 movq_m2r(*(dataptr
+1), mm7
);
1044 paddw_r2r(mm1
, mm4
); // z2
1046 paddw_r2r(mm1
, mm2
); // z4
1048 paddw_r2r(mm5
, mm0
); // z11
1049 psubw_r2r(mm5
, mm3
); // z13
1053 movq_r2r(mm3
, mm5
); // copy z13
1054 psubw_r2r(mm4
, mm3
); // y3=z13 - z2
1056 paddw_r2r(mm4
, mm5
); // y5=z13 + z2
1057 movq_r2r(mm0
, mm6
); // copy z11
1059 movq_r2m(mm3
, *(dataptr
+6)); //save y3
1060 psubw_r2r(mm2
, mm0
); // y7=z11 - z4
1062 movq_r2m(mm5
, *(dataptr
+10)); //save y5
1063 paddw_r2r(mm2
, mm6
); // y1=z11 + z4
1065 movq_r2m(mm0
, *(dataptr
+14)); //save y7
1067 /************************************************
1069 ************************************************/
1071 movq_m2r(*(dataptr
+3), mm1
); // load x1 /* stage 1 */
1072 movq_r2r(mm7
, mm0
); // copy x0
1074 movq_r2m(mm6
, *(dataptr
+2)); //save y1
1076 movq_m2r(*(dataptr
+5), mm2
); // load x2 /* stage 1 */
1077 movq_r2r(mm1
, mm6
); // copy x1
1079 paddw_m2r(*(dataptr
+15), mm0
); // tmp00 = x0 + x7
1081 movq_m2r(*(dataptr
+7), mm3
); // load x3 /* stage 1 */
1082 movq_r2r(mm2
, mm5
); // copy x2
1084 psubw_m2r(*(dataptr
+15), mm7
); // tmp07 = x0 - x7
1085 movq_r2r(mm3
, mm4
); // copy x3
1087 paddw_m2r(*(dataptr
+13), mm1
); // tmp01 = x1 + x6
1089 movq_r2m(mm7
, tmp7
); // save tmp07
1090 movq_r2r(mm0
, mm7
); // copy tmp00
1092 psubw_m2r(*(dataptr
+13), mm6
); // tmp06 = x1 - x6
1094 /* stage 2, Even Part */
1096 paddw_m2r(*(dataptr
+9), mm3
); // tmp03 = x3 + x4
1098 movq_r2m(mm6
, tmp6
); // save tmp07
1099 movq_r2r(mm1
, mm6
); // copy tmp01
1101 paddw_m2r(*(dataptr
+11), mm2
); // tmp02 = x2 + x5
1102 paddw_r2r(mm3
, mm0
); // tmp10 = tmp00 + tmp03
1104 psubw_r2r(mm3
, mm7
); // tmp13 = tmp00 - tmp03
1106 psubw_m2r(*(dataptr
+9), mm4
); // tmp04 = x3 - x4
1107 psubw_r2r(mm2
, mm6
); // tmp12 = tmp01 - tmp02
1109 paddw_r2r(mm2
, mm1
); // tmp11 = tmp01 + tmp02
1111 psubw_m2r(*(dataptr
+11), mm5
); // tmp05 = x2 - x5
1112 paddw_r2r(mm7
, mm6
); // tmp12 + tmp13
1114 /* stage 3, Even and stage 4 & 5 even */
1116 movq_m2r(tmp6
, mm2
); // load tmp6
1117 movq_r2r(mm0
, mm3
); // copy tmp10
1119 psllw_i2r(2, mm6
); // shift z1
1120 paddw_r2r(mm1
, mm0
); // y0=tmp10 + tmp11
1122 pmulhw_m2r(RTjpeg_C4
, mm6
); // z1
1123 psubw_r2r(mm1
, mm3
); // y4=tmp10 - tmp11
1125 movq_r2m(mm0
, *(dataptr
+1)); //save y0
1126 movq_r2r(mm7
, mm0
); // copy tmp13
1130 movq_r2m(mm3
, *(dataptr
+9)); //save y4
1131 paddw_r2r(mm5
, mm4
); // tmp10 = tmp4 + tmp5
1133 movq_m2r(tmp7
, mm3
); // load tmp7
1134 paddw_r2r(mm6
, mm0
); // tmp32 = tmp13 + z1
1136 paddw_r2r(mm2
, mm5
); // tmp11 = tmp5 + tmp6
1137 psubw_r2r(mm6
, mm7
); // tmp33 = tmp13 - z1
1139 movq_r2m(mm0
, *(dataptr
+5)); //save y2
1140 paddw_r2r(mm3
, mm2
); // tmp12 = tmp6 + tmp7
1144 movq_r2m(mm7
, *(dataptr
+13)); //save y6
1145 movq_r2r(mm4
, mm1
); // copy tmp10
1147 psubw_r2r(mm2
, mm1
); // tmp10 - tmp12
1148 psllw_i2r(2, mm4
); // shift tmp10
1150 movq_m2r(RTjpeg_C2mC6
, mm0
); // load C2mC6
1151 psllw_i2r(2, mm1
); // shift (tmp10-tmp12)
1153 pmulhw_m2r(RTjpeg_C6
, mm1
); // z5
1154 psllw_i2r(2, mm5
); // prepare for multiply
1156 pmulhw_r2r(mm0
, mm4
); // multiply by converted real
1160 pmulhw_m2r(RTjpeg_C4
, mm5
); // z3
1161 psllw_i2r(2, mm2
); // prepare for multiply
1163 pmulhw_m2r(RTjpeg_C2pC6
, mm2
); // multiply
1164 movq_r2r(mm3
, mm0
); // copy tmp7
1166 movq_m2r(*(dataptr
+9), mm7
); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1167 paddw_r2r(mm1
, mm4
); // z2
1169 paddw_r2r(mm5
, mm0
); // z11
1170 psubw_r2r(mm5
, mm3
); // z13
1174 movq_r2r(mm3
, mm5
); // copy z13
1175 paddw_r2r(mm1
, mm2
); // z4
1177 movq_r2r(mm0
, mm6
); // copy z11
1178 psubw_r2r(mm4
, mm5
); // y3
1180 paddw_r2r(mm2
, mm6
); // y1
1181 paddw_r2r(mm4
, mm3
); // y5
1183 movq_r2m(mm5
, *(dataptr
+7)); //save y3
1185 movq_r2m(mm6
, *(dataptr
+3)); //save y1
1186 psubw_r2r(mm2
, mm0
); // y7
1188 /************************************************************************************************
1190 ************************************************************************************************/
1192 movq_m2r(*(dataptr
+13), mm6
); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1193 movq_r2r(mm7
, mm5
); // copy first line
1195 punpcklwd_r2r(mm3
, mm7
); // m11:m01|m10:m00 - interleave first and second lines
1196 movq_r2r(mm6
, mm2
); // copy third line
1198 punpcklwd_r2r(mm0
, mm6
); // m31:m21|m30:m20 - interleave third and fourth lines
1199 movq_r2r(mm7
, mm1
); // copy first intermediate result
1201 punpckldq_r2r(mm6
, mm7
); // m30:m20|m10:m00 - interleave to produce result 1
1203 punpckhdq_r2r(mm6
, mm1
); // m31:m21|m11:m01 - interleave to produce result 2
1205 movq_r2m(mm7
, *(dataptr
+9)); // write result 1
1206 punpckhwd_r2r(mm3
, mm5
); // m13:m03|m12:m02 - interleave first and second lines
1208 movq_r2m(mm1
, *(dataptr
+11)); // write result 2
1209 punpckhwd_r2r(mm0
, mm2
); // m33:m23|m32:m22 - interleave third and fourth lines
1211 movq_r2r(mm5
, mm1
); // copy first intermediate result
1212 punpckldq_r2r(mm2
, mm5
); // m32:m22|m12:m02 - interleave to produce result 3
1214 movq_m2r(*(dataptr
+1), mm0
); // m03:m02|m01:m00 - first line, 4x4
1215 punpckhdq_r2r(mm2
, mm1
); // m33:m23|m13:m03 - interleave to produce result 4
1217 movq_r2m(mm5
, *(dataptr
+13)); // write result 3
1219 /****** last 4x4 done */
1221 movq_r2m(mm1
, *(dataptr
+15)); // write result 4, last 4x4
1223 movq_m2r(*(dataptr
+5), mm2
); // m23:m22|m21:m20 - third line
1224 movq_r2r(mm0
, mm6
); // copy first line
1226 punpcklwd_m2r(*(dataptr
+3), mm0
); // m11:m01|m10:m00 - interleave first and second lines
1227 movq_r2r(mm2
, mm7
); // copy third line
1229 punpcklwd_m2r(*(dataptr
+7), mm2
); // m31:m21|m30:m20 - interleave third and fourth lines
1230 movq_r2r(mm0
, mm4
); // copy first intermediate result
1234 movq_m2r(*(dataptr
+8), mm1
); // n03:n02|n01:n00 - first line
1235 punpckldq_r2r(mm2
, mm0
); // m30:m20|m10:m00 - interleave to produce first result
1237 movq_m2r(*(dataptr
+12), mm3
); // n23:n22|n21:n20 - third line
1238 punpckhdq_r2r(mm2
, mm4
); // m31:m21|m11:m01 - interleave to produce second result
1240 punpckhwd_m2r(*(dataptr
+3), mm6
); // m13:m03|m12:m02 - interleave first and second lines
1241 movq_r2r(mm1
, mm2
); // copy first line
1243 punpckhwd_m2r(*(dataptr
+7), mm7
); // m33:m23|m32:m22 - interleave third and fourth lines
1244 movq_r2r(mm6
, mm5
); // copy first intermediate result
1246 movq_r2m(mm0
, *(dataptr
+8)); // write result 1
1247 punpckhdq_r2r(mm7
, mm5
); // m33:m23|m13:m03 - produce third result
1249 punpcklwd_m2r(*(dataptr
+10), mm1
); // n11:n01|n10:n00 - interleave first and second lines
1250 movq_r2r(mm3
, mm0
); // copy third line
1252 punpckhwd_m2r(*(dataptr
+10), mm2
); // n13:n03|n12:n02 - interleave first and second lines
1254 movq_r2m(mm4
, *(dataptr
+10)); // write result 2 out
1255 punpckldq_r2r(mm7
, mm6
); // m32:m22|m12:m02 - produce fourth result
1257 punpcklwd_m2r(*(dataptr
+14), mm3
); // n33:n23|n32:n22 - interleave third and fourth lines
1258 movq_r2r(mm1
, mm4
); // copy second intermediate result
1260 movq_r2m(mm6
, *(dataptr
+12)); // write result 3 out
1261 punpckldq_r2r(mm3
, mm1
); //
1263 punpckhwd_m2r(*(dataptr
+14), mm0
); // n33:n23|n32:n22 - interleave third and fourth lines
1264 movq_r2r(mm2
, mm6
); // copy second intermediate result
1266 movq_r2m(mm5
, *(dataptr
+14)); // write result 4 out
1267 punpckhdq_r2r(mm3
, mm4
); // n31:n21|n11:n01- produce second result
1269 movq_r2m(mm1
, *(dataptr
+1)); // write result 5 out - (first result for other 4 x 4 block)
1270 punpckldq_r2r(mm0
, mm2
); // n32:n22|n12:n02- produce third result
1272 movq_r2m(mm4
, *(dataptr
+3)); // write result 6 out
1273 punpckhdq_r2r(mm0
, mm6
); // n33:n23|n13:n03 - produce fourth result
1275 movq_r2m(mm2
, *(dataptr
+5)); // write result 7 out
1277 movq_m2r(*dataptr
, mm0
); // m03:m02|m01:m00 - first line, first 4x4
1279 movq_r2m(mm6
, *(dataptr
+7)); // write result 8 out
1281 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1283 movq_m2r(*(dataptr
+4), mm7
); // m23:m22|m21:m20 - third line
1284 movq_r2r(mm0
, mm2
); // copy first line
1286 punpcklwd_m2r(*(dataptr
+2), mm0
); // m11:m01|m10:m00 - interleave first and second lines
1287 movq_r2r(mm7
, mm4
); // copy third line
1289 punpcklwd_m2r(*(dataptr
+6), mm7
); // m31:m21|m30:m20 - interleave third and fourth lines
1290 movq_r2r(mm0
, mm1
); // copy first intermediate result
1292 movq_m2r(*(dataptr
+2), mm6
); // m13:m12|m11:m10 - second line
1293 punpckldq_r2r(mm7
, mm0
); // m30:m20|m10:m00 - interleave to produce result 1
1295 movq_m2r(*(dataptr
+6), mm5
); // m33:m32|m31:m30 - fourth line
1296 punpckhdq_r2r(mm7
, mm1
); // m31:m21|m11:m01 - interleave to produce result 2
1298 movq_r2r(mm0
, mm7
); // write result 1
1299 punpckhwd_r2r(mm6
, mm2
); // m13:m03|m12:m02 - interleave first and second lines
1301 psubw_m2r(*(dataptr
+14), mm7
); // tmp07=x0-x7 /* Stage 1 */
1302 movq_r2r(mm1
, mm6
); // write result 2
1304 paddw_m2r(*(dataptr
+14), mm0
); // tmp00=x0+x7 /* Stage 1 */
1305 punpckhwd_r2r(mm5
, mm4
); // m33:m23|m32:m22 - interleave third and fourth lines
1307 paddw_m2r(*(dataptr
+12), mm1
); // tmp01=x1+x6 /* Stage 1 */
1308 movq_r2r(mm2
, mm3
); // copy first intermediate result
1310 psubw_m2r(*(dataptr
+12), mm6
); // tmp06=x1-x6 /* Stage 1 */
1311 punpckldq_r2r(mm4
, mm2
); // m32:m22|m12:m02 - interleave to produce result 3
1313 movq_r2m(mm7
, tmp7
); // save tmp07
1314 movq_r2r(mm2
, mm5
); // write result 3
1316 movq_r2m(mm6
, tmp6
); // save tmp06
1318 punpckhdq_r2r(mm4
, mm3
); // m33:m23|m13:m03 - interleave to produce result 4
1320 paddw_m2r(*(dataptr
+10), mm2
); // tmp02=x2+x5 /* stage 1 */
1321 movq_r2r(mm3
, mm4
); // write result 4
1323 /************************************************************************************************
1325 ************************************************************************************************/
1327 paddw_m2r(*(dataptr
+8), mm3
); // tmp03=x3+x4 /* stage 1*/
1330 psubw_m2r(*(dataptr
+8), mm4
); // tmp04=x3-x4 /* stage 1*/
1333 paddw_r2r(mm3
, mm0
); // tmp10 = tmp00 + tmp03 /* even 2 */
1334 psubw_r2r(mm3
, mm7
); // tmp13 = tmp00 - tmp03 /* even 2 */
1336 psubw_r2r(mm2
, mm6
); // tmp12 = tmp01 - tmp02 /* even 2 */
1337 paddw_r2r(mm2
, mm1
); // tmp11 = tmp01 + tmp02 /* even 2 */
1339 psubw_m2r(*(dataptr
+10), mm5
); // tmp05=x2-x5 /* stage 1*/
1340 paddw_r2r(mm7
, mm6
); // tmp12 + tmp13
1344 movq_m2r(tmp6
, mm2
);
1347 psllw_i2r(2, mm6
); // m8 * 2^2
1348 paddw_r2r(mm1
, mm0
);
1350 pmulhw_m2r(RTjpeg_C4
, mm6
); // z1
1351 psubw_r2r(mm1
, mm3
);
1353 movq_r2m(mm0
, *dataptr
);
1357 movq_r2m(mm3
, *(dataptr
+8));
1358 paddw_r2r(mm5
, mm4
); // tmp10
1360 movq_m2r(tmp7
, mm3
);
1361 paddw_r2r(mm6
, mm0
); // tmp32
1363 paddw_r2r(mm2
, mm5
); // tmp11
1364 psubw_r2r(mm6
, mm7
); // tmp33
1366 movq_r2m(mm0
, *(dataptr
+4));
1367 paddw_r2r(mm3
, mm2
); // tmp12
1370 movq_r2m(mm7
, *(dataptr
+12));
1371 movq_r2r(mm4
, mm1
); // copy of tmp10
1373 psubw_r2r(mm2
, mm1
); // tmp10 - tmp12
1374 psllw_i2r(2, mm4
); // m8 * 2^2
1376 movq_m2r(RTjpeg_C2mC6
, mm0
);
1379 pmulhw_m2r(RTjpeg_C6
, mm1
); // z5
1382 pmulhw_r2r(mm0
, mm4
); // z5
1386 pmulhw_m2r(RTjpeg_C2pC6
, mm2
);
1389 pmulhw_m2r(RTjpeg_C4
, mm5
); // z3
1390 movq_r2r(mm3
, mm0
); // copy tmp7
1392 movq_m2r(*(dataptr
+1), mm7
);
1393 paddw_r2r(mm1
, mm4
); // z2
1395 paddw_r2r(mm1
, mm2
); // z4
1397 paddw_r2r(mm5
, mm0
); // z11
1398 psubw_r2r(mm5
, mm3
); // z13
1402 movq_r2r(mm3
, mm5
); // copy z13
1403 psubw_r2r(mm4
, mm3
); // y3=z13 - z2
1405 paddw_r2r(mm4
, mm5
); // y5=z13 + z2
1406 movq_r2r(mm0
, mm6
); // copy z11
1408 movq_r2m(mm3
, *(dataptr
+6)); //save y3
1409 psubw_r2r(mm2
, mm0
); // y7=z11 - z4
1411 movq_r2m(mm5
, *(dataptr
+10)); //save y5
1412 paddw_r2r(mm2
, mm6
); // y1=z11 + z4
1414 movq_r2m(mm0
, *(dataptr
+14)); //save y7
1416 /************************************************
1418 ************************************************/
1420 movq_m2r(*(dataptr
+3), mm1
); // load x1 /* stage 1 */
1421 movq_r2r(mm7
, mm0
); // copy x0
1423 movq_r2m(mm6
, *(dataptr
+2)); //save y1
1425 movq_m2r(*(dataptr
+5), mm2
); // load x2 /* stage 1 */
1426 movq_r2r(mm1
, mm6
); // copy x1
1428 paddw_m2r(*(dataptr
+15), mm0
); // tmp00 = x0 + x7
1430 movq_m2r(*(dataptr
+7), mm3
); // load x3 /* stage 1 */
1431 movq_r2r(mm2
, mm5
); // copy x2
1433 psubw_m2r(*(dataptr
+15), mm7
); // tmp07 = x0 - x7
1434 movq_r2r(mm3
, mm4
); // copy x3
1436 paddw_m2r(*(dataptr
+13), mm1
); // tmp01 = x1 + x6
1438 movq_r2m(mm7
, tmp7
); // save tmp07
1439 movq_r2r(mm0
, mm7
); // copy tmp00
1441 psubw_m2r(*(dataptr
+13), mm6
); // tmp06 = x1 - x6
1443 /* stage 2, Even Part */
1445 paddw_m2r(*(dataptr
+9), mm3
); // tmp03 = x3 + x4
1447 movq_r2m(mm6
, tmp6
); // save tmp06
1448 movq_r2r(mm1
, mm6
); // copy tmp01
1450 paddw_m2r(*(dataptr
+11), mm2
); // tmp02 = x2 + x5
1451 paddw_r2r(mm3
, mm0
); // tmp10 = tmp00 + tmp03
1453 psubw_r2r(mm3
, mm7
); // tmp13 = tmp00 - tmp03
1455 psubw_m2r(*(dataptr
+9), mm4
); // tmp04 = x3 - x4
1456 psubw_r2r(mm2
, mm6
); // tmp12 = tmp01 - tmp02
1458 paddw_r2r(mm2
, mm1
); // tmp11 = tmp01 + tmp02
1460 psubw_m2r(*(dataptr
+11), mm5
); // tmp05 = x2 - x5
1461 paddw_r2r(mm7
, mm6
); // tmp12 + tmp13
1463 /* stage 3, Even and stage 4 & 5 even */
1465 movq_m2r(tmp6
, mm2
); // load tmp6
1466 movq_r2r(mm0
, mm3
); // copy tmp10
1468 psllw_i2r(2, mm6
); // shift z1
1469 paddw_r2r(mm1
, mm0
); // y0=tmp10 + tmp11
1471 pmulhw_m2r(RTjpeg_C4
, mm6
); // z1
1472 psubw_r2r(mm1
, mm3
); // y4=tmp10 - tmp11
1474 movq_r2m(mm0
, *(dataptr
+1)); //save y0
1475 movq_r2r(mm7
, mm0
); // copy tmp13
1479 movq_r2m(mm3
, *(dataptr
+9)); //save y4
1480 paddw_r2r(mm5
, mm4
); // tmp10 = tmp4 + tmp5
1482 movq_m2r(tmp7
, mm3
); // load tmp7
1483 paddw_r2r(mm6
, mm0
); // tmp32 = tmp13 + z1
1485 paddw_r2r(mm2
, mm5
); // tmp11 = tmp5 + tmp6
1486 psubw_r2r(mm6
, mm7
); // tmp33 = tmp13 - z1
1488 movq_r2m(mm0
, *(dataptr
+5)); //save y2
1489 paddw_r2r(mm3
, mm2
); // tmp12 = tmp6 + tmp7
1493 movq_r2m(mm7
, *(dataptr
+13)); //save y6
1494 movq_r2r(mm4
, mm1
); // copy tmp10
1496 psubw_r2r(mm2
, mm1
); // tmp10 - tmp12
1497 psllw_i2r(2, mm4
); // shift tmp10
1499 movq_m2r(RTjpeg_C2mC6
, mm0
); // load C2mC6
1500 psllw_i2r(2, mm1
); // shift (tmp10-tmp12)
1502 pmulhw_m2r(RTjpeg_C6
, mm1
); // z5
1503 psllw_i2r(2, mm5
); // prepare for multiply
1505 pmulhw_r2r(mm0
, mm4
); // multiply by converted real
1509 pmulhw_m2r(RTjpeg_C4
, mm5
); // z3
1510 psllw_i2r(2, mm2
); // prepare for multiply
1512 pmulhw_m2r(RTjpeg_C2pC6
, mm2
); // multiply
1513 movq_r2r(mm3
, mm0
); // copy tmp7
1515 movq_m2r(*(dataptr
+9), mm7
); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1516 paddw_r2r(mm1
, mm4
); // z2
1518 paddw_r2r(mm5
, mm0
); // z11
1519 psubw_r2r(mm5
, mm3
); // z13
1523 movq_r2r(mm3
, mm5
); // copy z13
1524 paddw_r2r(mm1
, mm2
); // z4
1526 movq_r2r(mm0
, mm6
); // copy z11
1527 psubw_r2r(mm4
, mm5
); // y3
1529 paddw_r2r(mm2
, mm6
); // y1
1530 paddw_r2r(mm4
, mm3
); // y5
1532 movq_r2m(mm5
, *(dataptr
+7)); //save y3
1533 psubw_r2r(mm2
, mm0
); // y7=z11 - z4
1535 movq_r2m(mm3
, *(dataptr
+11)); //save y5
1537 movq_r2m(mm6
, *(dataptr
+3)); //save y1
1539 movq_r2m(mm0
, *(dataptr
+15)); //save y7
/* Fixed-point IDCT constants, scaled by 2^8 (companions of MULTIPLY below). */
#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */

/* Descale by 2^3 with rounding (add half the divisor before the shift). */
#define DESCALE(x) ((__s16)(((x) + 4) >> 3))

/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */
/* BUGFIX: the whole expansion is now parenthesized; previously
   `RL(v) + 1` expanded to `cond ? 235 : (... + 1)` because the
   trailing operand of the conditional swallowed the `+ 1`. */
#define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x)))

/* Fixed-point multiply: (var * k) / 2^8, rounded.  The second
   parameter was named `const` (a C keyword — legal to the
   preprocessor, but confusing); renamed to `k`.  Positional
   expansion means all existing call sites are unaffected. */
#define MULTIPLY(var,k) (((__s32) ((var) * (k)) + 128) >> 8)
1557 void RTjpeg_idct_init(void)
1563 RTjpeg_liqt
[i
]=((__u64
)RTjpeg_liqt
[i
]*RTjpeg_aan_tab
[i
])>>32;
1564 RTjpeg_ciqt
[i
]=((__u64
)RTjpeg_ciqt
[i
]*RTjpeg_aan_tab
[i
])>>32;
1568 void RTjpeg_idct(__u8
*odata
, __s16
*data
, int rskip
)
1572 static mmx_t fix_141
= {0x5a825a825a825a82LL
};
1573 static mmx_t fix_184n261
= {0xcf04cf04cf04cf04LL
};
1574 static mmx_t fix_184
= {0x7641764176417641LL
};
1575 static mmx_t fix_n184
= {0x896f896f896f896fLL
};
1576 static mmx_t fix_108n184
= {0xcf04cf04cf04cf04LL
};
1578 mmx_t workspace
[64];
1579 mmx_t
*wsptr
= workspace
;
1580 register mmx_t
*dataptr
= (mmx_t
*)odata
;
1581 mmx_t
*idata
= (mmx_t
*)data
;
1585 * Perform inverse DCT on one block of coefficients.
1590 movq_m2r(*(idata
+10), mm1
); // load idata[DCTSIZE*5]
1592 movq_m2r(*(idata
+6), mm0
); // load idata[DCTSIZE*3]
1594 movq_m2r(*(idata
+2), mm3
); // load idata[DCTSIZE*1]
1596 movq_r2r(mm1
, mm2
); // copy tmp6 /* phase 6 */
1598 movq_m2r(*(idata
+14), mm4
); // load idata[DCTSIZE*7]
1600 paddw_r2r(mm0
, mm1
); // z13 = tmp6 + tmp5;
1602 psubw_r2r(mm0
, mm2
); // z10 = tmp6 - tmp5
1604 psllw_i2r(2, mm2
); // shift z10
1605 movq_r2r(mm2
, mm0
); // copy z10
1607 pmulhw_m2r(fix_184n261
, mm2
); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1608 movq_r2r(mm3
, mm5
); // copy tmp4
1610 pmulhw_m2r(fix_n184
, mm0
); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1611 paddw_r2r(mm4
, mm3
); // z11 = tmp4 + tmp7;
1613 movq_r2r(mm3
, mm6
); // copy z11 /* phase 5 */
1614 psubw_r2r(mm4
, mm5
); // z12 = tmp4 - tmp7;
1616 psubw_r2r(mm1
, mm6
); // z11-z13
1617 psllw_i2r(2, mm5
); // shift z12
1619 movq_m2r(*(idata
+12), mm4
); // load idata[DCTSIZE*6], even part
1620 movq_r2r(mm5
, mm7
); // copy z12
1622 pmulhw_m2r(fix_108n184
, mm5
); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1623 paddw_r2r(mm1
, mm3
); // tmp7 = z11 + z13;
1628 pmulhw_m2r(fix_184
, mm7
); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1631 movq_m2r(*(idata
+4), mm1
); // load idata[DCTSIZE*2]
1633 paddw_r2r(mm5
, mm0
); // tmp10
1635 paddw_r2r(mm7
, mm2
); // tmp12
1637 pmulhw_m2r(fix_141
, mm6
); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1638 psubw_r2r(mm3
, mm2
); // tmp6 = tmp12 - tmp7
1640 movq_r2r(mm1
, mm5
); // copy tmp1
1641 paddw_r2r(mm4
, mm1
); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1643 psubw_r2r(mm4
, mm5
); // tmp1-tmp3
1644 psubw_r2r(mm2
, mm6
); // tmp5 = tmp11 - tmp6;
1646 movq_r2m(mm1
, *(wsptr
)); // save tmp13 in workspace
1647 psllw_i2r(2, mm5
); // shift tmp1-tmp3
1649 movq_m2r(*(idata
), mm7
); // load idata[DCTSIZE*0]
1651 pmulhw_m2r(fix_141
, mm5
); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1652 paddw_r2r(mm6
, mm0
); // tmp4 = tmp10 + tmp5;
1654 movq_m2r(*(idata
+8), mm4
); // load idata[DCTSIZE*4]
1656 psubw_r2r(mm1
, mm5
); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1658 movq_r2m(mm0
, *(wsptr
+4)); // save tmp4 in workspace
1659 movq_r2r(mm7
, mm1
); // copy tmp0 /* phase 3 */
1661 movq_r2m(mm5
, *(wsptr
+2)); // save tmp12 in workspace
1662 psubw_r2r(mm4
, mm1
); // tmp11 = tmp0 - tmp2;
1664 paddw_r2r(mm4
, mm7
); // tmp10 = tmp0 + tmp2;
1665 movq_r2r(mm1
, mm5
); // copy tmp11
1667 paddw_m2r(*(wsptr
+2), mm1
); // tmp1 = tmp11 + tmp12;
1668 movq_r2r(mm7
, mm4
); // copy tmp10 /* phase 2 */
1670 paddw_m2r(*(wsptr
), mm7
); // tmp0 = tmp10 + tmp13;
1672 psubw_m2r(*(wsptr
), mm4
); // tmp3 = tmp10 - tmp13;
1673 movq_r2r(mm7
, mm0
); // copy tmp0
1675 psubw_m2r(*(wsptr
+2), mm5
); // tmp2 = tmp11 - tmp12;
1676 paddw_r2r(mm3
, mm7
); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1678 psubw_r2r(mm3
, mm0
); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1680 movq_r2m(mm7
, *(wsptr
)); // wsptr[DCTSIZE*0]
1681 movq_r2r(mm1
, mm3
); // copy tmp1
1683 movq_r2m(mm0
, *(wsptr
+14)); // wsptr[DCTSIZE*7]
1684 paddw_r2r(mm2
, mm1
); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1686 psubw_r2r(mm2
, mm3
); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1688 movq_r2m(mm1
, *(wsptr
+2)); // wsptr[DCTSIZE*1]
1689 movq_r2r(mm4
, mm1
); // copy tmp3
1691 movq_r2m(mm3
, *(wsptr
+12)); // wsptr[DCTSIZE*6]
1693 paddw_m2r(*(wsptr
+4), mm4
); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1695 psubw_m2r(*(wsptr
+4), mm1
); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1697 movq_r2m(mm4
, *(wsptr
+8));
1698 movq_r2r(mm5
, mm7
); // copy tmp2
1700 paddw_r2r(mm6
, mm5
); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1702 movq_r2m(mm1
, *(wsptr
+6));
1703 psubw_r2r(mm6
, mm7
); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1705 movq_r2m(mm5
, *(wsptr
+4));
1707 movq_r2m(mm7
, *(wsptr
+10));
1712 /*****************************************************************/
1717 /*****************************************************************/
1719 movq_m2r(*(idata
+10), mm1
); // load idata[DCTSIZE*5]
1721 movq_m2r(*(idata
+6), mm0
); // load idata[DCTSIZE*3]
1723 movq_m2r(*(idata
+2), mm3
); // load idata[DCTSIZE*1]
1724 movq_r2r(mm1
, mm2
); // copy tmp6 /* phase 6 */
1726 movq_m2r(*(idata
+14), mm4
); // load idata[DCTSIZE*7]
1727 paddw_r2r(mm0
, mm1
); // z13 = tmp6 + tmp5;
1729 psubw_r2r(mm0
, mm2
); // z10 = tmp6 - tmp5
1731 psllw_i2r(2, mm2
); // shift z10
1732 movq_r2r(mm2
, mm0
); // copy z10
1734 pmulhw_m2r(fix_184n261
, mm2
); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1735 movq_r2r(mm3
, mm5
); // copy tmp4
1737 pmulhw_m2r(fix_n184
, mm0
); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1738 paddw_r2r(mm4
, mm3
); // z11 = tmp4 + tmp7;
1740 movq_r2r(mm3
, mm6
); // copy z11 /* phase 5 */
1741 psubw_r2r(mm4
, mm5
); // z12 = tmp4 - tmp7;
1743 psubw_r2r(mm1
, mm6
); // z11-z13
1744 psllw_i2r(2, mm5
); // shift z12
1746 movq_m2r(*(idata
+12), mm4
); // load idata[DCTSIZE*6], even part
1747 movq_r2r(mm5
, mm7
); // copy z12
1749 pmulhw_m2r(fix_108n184
, mm5
); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1750 paddw_r2r(mm1
, mm3
); // tmp7 = z11 + z13;
1755 pmulhw_m2r(fix_184
, mm7
); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1758 movq_m2r(*(idata
+4), mm1
); // load idata[DCTSIZE*2]
1760 paddw_r2r(mm5
, mm0
); // tmp10
1762 paddw_r2r(mm7
, mm2
); // tmp12
1764 pmulhw_m2r(fix_141
, mm6
); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1765 psubw_r2r(mm3
, mm2
); // tmp6 = tmp12 - tmp7
1767 movq_r2r(mm1
, mm5
); // copy tmp1
1768 paddw_r2r(mm4
, mm1
); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1770 psubw_r2r(mm4
, mm5
); // tmp1-tmp3
1771 psubw_r2r(mm2
, mm6
); // tmp5 = tmp11 - tmp6;
1773 movq_r2m(mm1
, *(wsptr
)); // save tmp13 in workspace
1774 psllw_i2r(2, mm5
); // shift tmp1-tmp3
1776 movq_m2r(*(idata
), mm7
); // load idata[DCTSIZE*0]
1777 paddw_r2r(mm6
, mm0
); // tmp4 = tmp10 + tmp5;
1779 pmulhw_m2r(fix_141
, mm5
); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1781 movq_m2r(*(idata
+8), mm4
); // load idata[DCTSIZE*4]
1783 psubw_r2r(mm1
, mm5
); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1785 movq_r2m(mm0
, *(wsptr
+4)); // save tmp4 in workspace
1786 movq_r2r(mm7
, mm1
); // copy tmp0 /* phase 3 */
1788 movq_r2m(mm5
, *(wsptr
+2)); // save tmp12 in workspace
1789 psubw_r2r(mm4
, mm1
); // tmp11 = tmp0 - tmp2;
1791 paddw_r2r(mm4
, mm7
); // tmp10 = tmp0 + tmp2;
1792 movq_r2r(mm1
, mm5
); // copy tmp11
1794 paddw_m2r(*(wsptr
+2), mm1
); // tmp1 = tmp11 + tmp12;
1795 movq_r2r(mm7
, mm4
); // copy tmp10 /* phase 2 */
1797 paddw_m2r(*(wsptr
), mm7
); // tmp0 = tmp10 + tmp13;
1799 psubw_m2r(*(wsptr
), mm4
); // tmp3 = tmp10 - tmp13;
1800 movq_r2r(mm7
, mm0
); // copy tmp0
1802 psubw_m2r(*(wsptr
+2), mm5
); // tmp2 = tmp11 - tmp12;
1803 paddw_r2r(mm3
, mm7
); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1805 psubw_r2r(mm3
, mm0
); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1807 movq_r2m(mm7
, *(wsptr
)); // wsptr[DCTSIZE*0]
1808 movq_r2r(mm1
, mm3
); // copy tmp1
1810 movq_r2m(mm0
, *(wsptr
+14)); // wsptr[DCTSIZE*7]
1811 paddw_r2r(mm2
, mm1
); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1813 psubw_r2r(mm2
, mm3
); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1815 movq_r2m(mm1
, *(wsptr
+2)); // wsptr[DCTSIZE*1]
1816 movq_r2r(mm4
, mm1
); // copy tmp3
1818 movq_r2m(mm3
, *(wsptr
+12)); // wsptr[DCTSIZE*6]
1820 paddw_m2r(*(wsptr
+4), mm4
); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1822 psubw_m2r(*(wsptr
+4), mm1
); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1824 movq_r2m(mm4
, *(wsptr
+8));
1825 movq_r2r(mm5
, mm7
); // copy tmp2
1827 paddw_r2r(mm6
, mm5
); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1829 movq_r2m(mm1
, *(wsptr
+6));
1830 psubw_r2r(mm6
, mm7
); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1832 movq_r2m(mm5
, *(wsptr
+4));
1834 movq_r2m(mm7
, *(wsptr
+10));
1836 /*****************************************************************/
1838 /* Pass 2: process rows from work array, store into output array. */
1839 /* Note that we must descale the results by a factor of 8 == 2**3, */
1840 /* and also undo the PASS1_BITS scaling. */
1842 /*****************************************************************/
1847 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1848 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1849 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1850 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1851 movq_m2r(*(wsptr
), mm0
); // wsptr[0,0],[0,1],[0,2],[0,3]
1853 movq_m2r(*(wsptr
+1), mm1
); // wsptr[0,4],[0,5],[0,6],[0,7]
1856 movq_m2r(*(wsptr
+2), mm3
); // wsptr[1,0],[1,1],[1,2],[1,3]
1857 paddw_r2r(mm1
, mm0
); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1859 movq_m2r(*(wsptr
+3), mm4
); // wsptr[1,4],[1,5],[1,6],[1,7]
1860 psubw_r2r(mm1
, mm2
); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1865 paddw_r2r(mm4
, mm3
); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1868 psubw_r2r(mm4
, mm5
); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1869 punpcklwd_r2r(mm3
, mm0
); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1871 movq_m2r(*(wsptr
+7), mm7
); // wsptr[3,4],[3,5],[3,6],[3,7]
1872 punpckhwd_r2r(mm3
, mm6
); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1874 movq_m2r(*(wsptr
+4), mm3
); // wsptr[2,0],[2,1],[2,2],[2,3]
1875 punpckldq_r2r(mm6
, mm0
); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1877 punpcklwd_r2r(mm5
, mm1
); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1880 movq_m2r(*(wsptr
+6), mm6
); // wsptr[3,0],[3,1],[3,2],[3,3]
1881 punpckhwd_r2r(mm5
, mm2
); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1883 movq_m2r(*(wsptr
+5), mm5
); // wsptr[2,4],[2,5],[2,6],[2,7]
1884 punpckldq_r2r(mm2
, mm1
); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1887 paddw_r2r(mm5
, mm3
); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1890 psubw_r2r(mm5
, mm4
); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1891 paddw_r2r(mm7
, mm6
); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1894 punpcklwd_r2r(mm6
, mm3
); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1896 psubw_r2r(mm7
, mm2
); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1897 punpckhwd_r2r(mm6
, mm5
); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1900 punpckldq_r2r(mm5
, mm3
); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1902 punpcklwd_r2r(mm2
, mm4
); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1904 punpckhwd_r2r(mm2
, mm7
); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1906 punpckldq_r2r(mm7
, mm4
); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1911 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1912 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1916 punpckhdq_r2r(mm4
, mm6
); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1918 punpckldq_r2r(mm4
, mm1
); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1921 pmulhw_m2r(fix_141
, mm6
);
1922 punpckldq_r2r(mm3
, mm0
); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1924 punpckhdq_r2r(mm3
, mm2
); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1927 // tmp0 = tmp10 + tmp13;
1928 // tmp3 = tmp10 - tmp13;
1929 paddw_r2r(mm2
, mm0
); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1930 psubw_r2r(mm2
, mm7
); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1932 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1933 psubw_r2r(mm2
, mm6
); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1934 // tmp1 = tmp11 + tmp12;
1935 // tmp2 = tmp11 - tmp12;
1942 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1943 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1944 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1945 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1946 movq_m2r(*(wsptr
), mm3
); // wsptr[0,0],[0,1],[0,2],[0,3]
1947 paddw_r2r(mm6
, mm1
); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1949 movq_m2r(*(wsptr
+1), mm4
); // wsptr[0,4],[0,5],[0,6],[0,7]
1950 psubw_r2r(mm6
, mm5
); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1953 punpckldq_r2r(mm4
, mm3
); // wsptr[0,0],[0,1],[0,4],[0,5]
1955 punpckhdq_r2r(mm6
, mm4
); // wsptr[0,6],[0,7],[0,2],[0,3]
1958 //Save tmp0 and tmp1 in wsptr
1959 movq_r2m(mm0
, *(wsptr
)); // save tmp0
1960 paddw_r2r(mm4
, mm2
); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1963 //Continue with z10 --- z13
1964 movq_m2r(*(wsptr
+2), mm6
); // wsptr[1,0],[1,1],[1,2],[1,3]
1965 psubw_r2r(mm4
, mm3
); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1967 movq_m2r(*(wsptr
+3), mm0
); // wsptr[1,4],[1,5],[1,6],[1,7]
1970 movq_r2m(mm1
, *(wsptr
+1)); // save tmp1
1971 punpckldq_r2r(mm0
, mm6
); // wsptr[1,0],[1,1],[1,4],[1,5]
1973 punpckhdq_r2r(mm4
, mm0
); // wsptr[1,6],[1,7],[1,2],[1,3]
1976 //Save tmp2 and tmp3 in wsptr
1977 paddw_r2r(mm0
, mm6
); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1980 //Continue with z10 --- z13
1981 movq_r2m(mm5
, *(wsptr
+2)); // save tmp2
1982 punpcklwd_r2r(mm6
, mm2
); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1984 psubw_r2r(mm0
, mm1
); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1985 punpckhwd_r2r(mm6
, mm4
); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1988 punpcklwd_r2r(mm1
, mm3
); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1990 movq_r2m(mm7
, *(wsptr
+3)); // save tmp3
1991 punpckhwd_r2r(mm1
, mm0
); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1993 movq_m2r(*(wsptr
+4), mm6
); // wsptr[2,0],[2,1],[2,2],[2,3]
1994 punpckhdq_r2r(mm2
, mm0
); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1996 movq_m2r(*(wsptr
+5), mm7
); // wsptr[2,4],[2,5],[2,6],[2,7]
1997 punpckhdq_r2r(mm4
, mm3
); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1999 movq_m2r(*(wsptr
+6), mm1
); // wsptr[3,0],[3,1],[3,2],[3,3]
2002 punpckldq_r2r(mm7
, mm6
); // wsptr[2,0],[2,1],[2,4],[2,5]
2005 punpckhdq_r2r(mm4
, mm7
); // wsptr[2,6],[2,7],[2,2],[2,3]
2008 movq_m2r(*(wsptr
+7), mm4
); // wsptr[3,4],[3,5],[3,6],[3,7]
2009 paddw_r2r(mm7
, mm6
); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2011 psubw_r2r(mm7
, mm2
); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2012 punpckldq_r2r(mm4
, mm1
); // wsptr[3,0],[3,1],[3,4],[3,5]
2014 punpckhdq_r2r(mm5
, mm4
); // wsptr[3,6],[3,7],[3,2],[3,3]
2017 paddw_r2r(mm4
, mm1
); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2018 psubw_r2r(mm4
, mm7
); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2021 punpcklwd_r2r(mm1
, mm6
); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2023 punpckhwd_r2r(mm1
, mm5
); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2026 punpcklwd_r2r(mm7
, mm2
); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2028 punpckhwd_r2r(mm7
, mm4
); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2030 punpckhdq_r2r(mm6
, mm4
); /// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2032 punpckhdq_r2r(mm5
, mm2
); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2035 punpckldq_r2r(mm4
, mm0
); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2037 punpckhdq_r2r(mm4
, mm5
); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2040 punpckhdq_r2r(mm2
, mm4
); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2043 punpckldq_r2r(mm2
, mm3
); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2044 // tmp7 = z11 + z13; /* phase 5 */
2045 // tmp8 = z11 - z13; /* phase 5 */
2046 psubw_r2r(mm4
, mm1
); // tmp8
2048 paddw_r2r(mm4
, mm5
); // tmp7
2049 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2054 pmulhw_m2r(fix_141
, mm1
); // tmp21
2055 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2056 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2060 pmulhw_m2r(fix_n184
, mm7
);
2063 movq_m2r(*(wsptr
), mm2
); // tmp0,final1
2065 pmulhw_m2r(fix_108n184
, mm6
);
2066 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2067 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2068 movq_r2r(mm2
, mm4
); // final1
2070 pmulhw_m2r(fix_184n261
, mm0
);
2071 paddw_r2r(mm5
, mm2
); // tmp0+tmp7,final1
2073 pmulhw_m2r(fix_184
, mm3
);
2074 psubw_r2r(mm5
, mm4
); // tmp0-tmp7,final1
2076 // tmp6 = tmp22 - tmp7; /* phase 2 */
2077 psraw_i2r(3, mm2
); // outptr[0,0],[1,0],[2,0],[3,0],final1
2079 paddw_r2r(mm6
, mm7
); // tmp20
2080 psraw_i2r(3, mm4
); // outptr[0,7],[1,7],[2,7],[3,7],final1
2082 paddw_r2r(mm0
, mm3
); // tmp22
2084 // tmp5 = tmp21 - tmp6;
2085 psubw_r2r(mm5
, mm3
); // tmp6
2087 // tmp4 = tmp20 + tmp5;
2088 movq_m2r(*(wsptr
+1), mm0
); // tmp1,final2
2089 psubw_r2r(mm3
, mm1
); // tmp5
2091 movq_r2r(mm0
, mm6
); // final2
2092 paddw_r2r(mm3
, mm0
); // tmp1+tmp6,final2
2094 /* Final output stage: scale down by a factor of 8 and range-limit */
2097 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2099 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2100 // & RANGE_MASK]; final1
2103 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2105 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2106 // & RANGE_MASK]; final2
2107 psubw_r2r(mm3
, mm6
); // tmp1-tmp6,final2
2108 psraw_i2r(3, mm0
); // outptr[0,1],[1,1],[2,1],[3,1]
2110 psraw_i2r(3, mm6
); // outptr[0,6],[1,6],[2,6],[3,6]
2112 packuswb_r2r(mm4
, mm0
); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2114 movq_m2r(*(wsptr
+2), mm5
); // tmp2,final3
2115 packuswb_r2r(mm6
, mm2
); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2117 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2119 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2120 // & RANGE_MASK]; final3
2121 paddw_r2r(mm1
, mm7
); // tmp4
2124 paddw_r2r(mm1
, mm5
); // tmp2+tmp5
2125 psubw_r2r(mm1
, mm3
); // tmp2-tmp5
2127 psraw_i2r(3, mm5
); // outptr[0,2],[1,2],[2,2],[3,2]
2129 movq_m2r(*(wsptr
+3), mm4
); // tmp3,final4
2130 psraw_i2r(3, mm3
); // outptr[0,5],[1,5],[2,5],[3,5]
2134 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2136 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2137 // & RANGE_MASK]; final4
2139 paddw_r2r(mm7
, mm4
); // tmp3+tmp4
2141 psubw_r2r(mm7
, mm6
); // tmp3-tmp4
2142 psraw_i2r(3, mm4
); // outptr[0,4],[1,4],[2,4],[3,4]
2144 // mov ecx, [dataptr]
2146 psraw_i2r(3, mm6
); // outptr[0,3],[1,3],[2,3],[3,3]
2148 packuswb_r2r(mm4
, mm5
); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2150 packuswb_r2r(mm3
, mm6
); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2154 punpcklbw_r2r(mm0
, mm2
); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2156 punpckhbw_r2r(mm0
, mm4
); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2159 punpcklbw_r2r(mm6
, mm5
); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2163 punpckhbw_r2r(mm6
, mm7
); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2165 punpcklwd_r2r(mm5
, mm2
); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2167 // add ecx, output_col
2170 punpckhwd_r2r(mm5
, mm1
); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2173 punpcklwd_r2r(mm4
, mm6
); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2175 // mov idata, [dataptr]
2177 punpckldq_r2r(mm6
, mm2
); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2183 // add idata, output_col
2185 punpckhwd_r2r(mm4
, mm7
); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2187 movq_r2m(mm2
, *(dataptr
));
2189 punpckhdq_r2r(mm6
, mm0
); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2192 movq_r2m(mm0
, *(dataptr
));
2194 punpckldq_r2r(mm7
, mm1
); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2195 punpckhdq_r2r(mm7
, mm3
); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2198 movq_r2m(mm1
, *(dataptr
));
2201 movq_r2m(mm3
, *(dataptr
));
2203 /*******************************************************************/
2207 /*******************************************************************/
2209 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2210 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2211 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2212 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2213 movq_m2r(*(wsptr
), mm0
); // wsptr[0,0],[0,1],[0,2],[0,3]
2215 movq_m2r(*(wsptr
+1), mm1
); // wsptr[0,4],[0,5],[0,6],[0,7]
2218 movq_m2r(*(wsptr
+2), mm3
); // wsptr[1,0],[1,1],[1,2],[1,3]
2219 paddw_r2r(mm1
, mm0
); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2221 movq_m2r(*(wsptr
+3), mm4
); // wsptr[1,4],[1,5],[1,6],[1,7]
2222 psubw_r2r(mm1
, mm2
); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2227 paddw_r2r(mm4
, mm3
); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2230 psubw_r2r(mm4
, mm5
); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2231 punpcklwd_r2r(mm3
, mm0
); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2233 movq_m2r(*(wsptr
+7), mm7
); // wsptr[3,4],[3,5],[3,6],[3,7]
2234 punpckhwd_r2r(mm3
, mm6
); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2236 movq_m2r(*(wsptr
+4), mm3
); // wsptr[2,0],[2,1],[2,2],[2,3]
2237 punpckldq_r2r(mm6
, mm0
); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2239 punpcklwd_r2r(mm5
, mm1
); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2242 movq_m2r(*(wsptr
+6), mm6
); // wsptr[3,0],[3,1],[3,2],[3,3]
2243 punpckhwd_r2r(mm5
, mm2
); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2245 movq_m2r(*(wsptr
+5), mm5
); // wsptr[2,4],[2,5],[2,6],[2,7]
2246 punpckldq_r2r(mm2
, mm1
); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2248 paddw_r2r(mm5
, mm3
); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2251 psubw_r2r(mm5
, mm4
); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2252 paddw_r2r(mm7
, mm6
); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2255 punpcklwd_r2r(mm6
, mm3
); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2257 psubw_r2r(mm7
, mm2
); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2258 punpckhwd_r2r(mm6
, mm5
); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2261 punpckldq_r2r(mm5
, mm3
); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2263 punpcklwd_r2r(mm2
, mm4
); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2265 punpckhwd_r2r(mm2
, mm7
); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2267 punpckldq_r2r(mm7
, mm4
); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2272 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2273 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2276 punpckhdq_r2r(mm4
, mm6
); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2278 punpckldq_r2r(mm4
, mm1
); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2281 pmulhw_m2r(fix_141
, mm6
);
2282 punpckldq_r2r(mm3
, mm0
); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2284 punpckhdq_r2r(mm3
, mm2
); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2287 // tmp0 = tmp10 + tmp13;
2288 // tmp3 = tmp10 - tmp13;
2289 paddw_r2r(mm2
, mm0
); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2290 psubw_r2r(mm2
, mm7
); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2292 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2293 psubw_r2r(mm2
, mm6
); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2294 // tmp1 = tmp11 + tmp12;
2295 // tmp2 = tmp11 - tmp12;
2303 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2304 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2305 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2306 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2307 movq_m2r(*(wsptr
), mm3
); // wsptr[0,0],[0,1],[0,2],[0,3]
2308 paddw_r2r(mm6
, mm1
); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2310 movq_m2r(*(wsptr
+1), mm4
); // wsptr[0,4],[0,5],[0,6],[0,7]
2311 psubw_r2r(mm6
, mm5
); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2314 punpckldq_r2r(mm4
, mm3
); // wsptr[0,0],[0,1],[0,4],[0,5]
2316 punpckhdq_r2r(mm6
, mm4
); // wsptr[0,6],[0,7],[0,2],[0,3]
2319 //Save tmp0 and tmp1 in wsptr
2320 movq_r2m(mm0
, *(wsptr
)); // save tmp0
2321 paddw_r2r(mm4
, mm2
); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2324 //Continue with z10 --- z13
2325 movq_m2r(*(wsptr
+2), mm6
); // wsptr[1,0],[1,1],[1,2],[1,3]
2326 psubw_r2r(mm4
, mm3
); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2328 movq_m2r(*(wsptr
+3), mm0
); // wsptr[1,4],[1,5],[1,6],[1,7]
2331 movq_r2m(mm1
, *(wsptr
+1)); // save tmp1
2332 punpckldq_r2r(mm0
, mm6
); // wsptr[1,0],[1,1],[1,4],[1,5]
2334 punpckhdq_r2r(mm4
, mm0
); // wsptr[1,6],[1,7],[1,2],[1,3]
2337 //Save tmp2 and tmp3 in wsptr
2338 paddw_r2r(mm0
, mm6
); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2341 //Continue with z10 --- z13
2342 movq_r2m(mm5
, *(wsptr
+2)); // save tmp2
2343 punpcklwd_r2r(mm6
, mm2
); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2345 psubw_r2r(mm0
, mm1
); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2346 punpckhwd_r2r(mm6
, mm4
); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2349 punpcklwd_r2r(mm1
, mm3
); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2351 movq_r2m(mm7
, *(wsptr
+3)); // save tmp3
2352 punpckhwd_r2r(mm1
, mm0
); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2354 movq_m2r(*(wsptr
+4), mm6
); // wsptr[2,0],[2,1],[2,2],[2,3]
2355 punpckhdq_r2r(mm2
, mm0
); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2357 movq_m2r(*(wsptr
+5), mm7
); // wsptr[2,4],[2,5],[2,6],[2,7]
2358 punpckhdq_r2r(mm4
, mm3
); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2360 movq_m2r(*(wsptr
+6), mm1
); // wsptr[3,0],[3,1],[3,2],[3,3]
2363 punpckldq_r2r(mm7
, mm6
); // wsptr[2,0],[2,1],[2,4],[2,5]
2366 punpckhdq_r2r(mm4
, mm7
); // wsptr[2,6],[2,7],[2,2],[2,3]
2369 movq_m2r(*(wsptr
+7), mm4
); // wsptr[3,4],[3,5],[3,6],[3,7]
2370 paddw_r2r(mm7
, mm6
); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2372 psubw_r2r(mm7
, mm2
); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2373 punpckldq_r2r(mm4
, mm1
); // wsptr[3,0],[3,1],[3,4],[3,5]
2375 punpckhdq_r2r(mm5
, mm4
); // wsptr[3,6],[3,7],[3,2],[3,3]
2378 paddw_r2r(mm4
, mm1
); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2379 psubw_r2r(mm4
, mm7
); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2382 punpcklwd_r2r(mm1
, mm6
); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2384 punpckhwd_r2r(mm1
, mm5
); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2387 punpcklwd_r2r(mm7
, mm2
); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2389 punpckhwd_r2r(mm7
, mm4
); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2391 punpckhdq_r2r(mm6
, mm4
); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2393 punpckhdq_r2r(mm5
, mm2
); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2396 punpckldq_r2r(mm4
, mm0
); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2398 punpckhdq_r2r(mm4
, mm5
); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2401 punpckhdq_r2r(mm2
, mm4
); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2404 punpckldq_r2r(mm2
, mm3
); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2405 // tmp7 = z11 + z13; /* phase 5 */
2406 // tmp8 = z11 - z13; /* phase 5 */
2407 psubw_r2r(mm4
, mm1
); // tmp8
2409 paddw_r2r(mm4
, mm5
); // tmp7
2410 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2415 pmulhw_m2r(fix_141
, mm1
); // tmp21
2416 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2417 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2421 pmulhw_m2r(fix_n184
, mm7
);
2424 movq_m2r(*(wsptr
), mm2
); // tmp0,final1
2426 pmulhw_m2r(fix_108n184
, mm6
);
2427 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2428 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2429 movq_r2r(mm2
, mm4
); // final1
2431 pmulhw_m2r(fix_184n261
, mm0
);
2432 paddw_r2r(mm5
, mm2
); // tmp0+tmp7,final1
2434 pmulhw_m2r(fix_184
, mm3
);
2435 psubw_r2r(mm5
, mm4
); // tmp0-tmp7,final1
2437 // tmp6 = tmp22 - tmp7; /* phase 2 */
2438 psraw_i2r(3, mm2
); // outptr[0,0],[1,0],[2,0],[3,0],final1
2440 paddw_r2r(mm6
, mm7
); // tmp20
2441 psraw_i2r(3, mm4
); // outptr[0,7],[1,7],[2,7],[3,7],final1
2443 paddw_r2r(mm0
, mm3
); // tmp22
2445 // tmp5 = tmp21 - tmp6;
2446 psubw_r2r(mm5
, mm3
); // tmp6
2448 // tmp4 = tmp20 + tmp5;
2449 movq_m2r(*(wsptr
+1), mm0
); // tmp1,final2
2450 psubw_r2r(mm3
, mm1
); // tmp5
2452 movq_r2r(mm0
, mm6
); // final2
2453 paddw_r2r(mm3
, mm0
); // tmp1+tmp6,final2
2455 /* Final output stage: scale down by a factor of 8 and range-limit */
2457 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2459 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2460 // & RANGE_MASK]; final1
2463 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2465 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2466 // & RANGE_MASK]; final2
2467 psubw_r2r(mm3
, mm6
); // tmp1-tmp6,final2
2468 psraw_i2r(3, mm0
); // outptr[0,1],[1,1],[2,1],[3,1]
2470 psraw_i2r(3, mm6
); // outptr[0,6],[1,6],[2,6],[3,6]
2472 packuswb_r2r(mm4
, mm0
); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2474 movq_m2r(*(wsptr
+2), mm5
); // tmp2,final3
2475 packuswb_r2r(mm6
, mm2
); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2477 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2479 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2480 // & RANGE_MASK]; final3
2481 paddw_r2r(mm1
, mm7
); // tmp4
2484 paddw_r2r(mm1
, mm5
); // tmp2+tmp5
2485 psubw_r2r(mm1
, mm3
); // tmp2-tmp5
2487 psraw_i2r(3, mm5
); // outptr[0,2],[1,2],[2,2],[3,2]
2489 movq_m2r(*(wsptr
+3), mm4
); // tmp3,final4
2490 psraw_i2r(3, mm3
); // outptr[0,5],[1,5],[2,5],[3,5]
2494 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2496 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2497 // & RANGE_MASK]; final4
2499 paddw_r2r(mm7
, mm4
); // tmp3+tmp4
2501 psubw_r2r(mm7
, mm6
); // tmp3-tmp4
2502 psraw_i2r(3, mm4
); // outptr[0,4],[1,4],[2,4],[3,4]
2504 psraw_i2r(3, mm6
); // outptr[0,3],[1,3],[2,3],[3,3]
2507 movq_r2m(mm4, *dummy);
2508 fprintf(stderr, "3-4 %016llx\n", dummy);
2509 movq_r2m(mm4, *dummy);
2510 fprintf(stderr, "3+4 %016llx\n", dummy);
2514 packuswb_r2r(mm4
, mm5
); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2516 packuswb_r2r(mm3
, mm6
); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2520 punpcklbw_r2r(mm0
, mm2
); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2522 punpckhbw_r2r(mm0
, mm4
); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2525 punpcklbw_r2r(mm6
, mm5
); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2527 punpckhbw_r2r(mm6
, mm7
); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2529 punpcklwd_r2r(mm5
, mm2
); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2532 punpckhwd_r2r(mm5
, mm1
); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2535 punpcklwd_r2r(mm4
, mm6
); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2537 punpckldq_r2r(mm6
, mm2
); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2541 punpckhwd_r2r(mm4
, mm7
); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2544 movq_r2m(mm2
, *(dataptr
));
2546 punpckhdq_r2r(mm6
, mm0
); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2549 movq_r2m(mm0
, *(dataptr
));
2551 punpckldq_r2r(mm7
, mm1
); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2553 punpckhdq_r2r(mm7
, mm3
); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2556 movq_r2m(mm1
, *(dataptr
));
2559 movq_r2m(mm3
, *(dataptr
));
2562 __s32 tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
2563 __s32 tmp10
, tmp11
, tmp12
, tmp13
;
2564 __s32 z5
, z10
, z11
, z12
, z13
;
2570 __s32 workspace
[64];
2574 for (ctr
= 8; ctr
> 0; ctr
--) {
2576 if ((inptr
[8] | inptr
[16] | inptr
[24] |
2577 inptr
[32] | inptr
[40] | inptr
[48] | inptr
[56]) == 0) {
2598 tmp10
= tmp0
+ tmp2
;
2599 tmp11
= tmp0
- tmp2
;
2601 tmp13
= tmp1
+ tmp3
;
2602 tmp12
= MULTIPLY(tmp1
- tmp3
, FIX_1_414213562
) - tmp13
;
2604 tmp0
= tmp10
+ tmp13
;
2605 tmp3
= tmp10
- tmp13
;
2606 tmp1
= tmp11
+ tmp12
;
2607 tmp2
= tmp11
- tmp12
;
2620 tmp11
= MULTIPLY(z11
- z13
, FIX_1_414213562
);
2622 z5
= MULTIPLY(z10
+ z12
, FIX_1_847759065
);
2623 tmp10
= MULTIPLY(z12
, FIX_1_082392200
) - z5
;
2624 tmp12
= MULTIPLY(z10
, - FIX_2_613125930
) + z5
;
2626 tmp6
= tmp12
- tmp7
;
2627 tmp5
= tmp11
- tmp6
;
2628 tmp4
= tmp10
+ tmp5
;
2630 wsptr
[0] = (__s32
) (tmp0
+ tmp7
);
2631 wsptr
[56] = (__s32
) (tmp0
- tmp7
);
2632 wsptr
[8] = (__s32
) (tmp1
+ tmp6
);
2633 wsptr
[48] = (__s32
) (tmp1
- tmp6
);
2634 wsptr
[16] = (__s32
) (tmp2
+ tmp5
);
2635 wsptr
[40] = (__s32
) (tmp2
- tmp5
);
2636 wsptr
[32] = (__s32
) (tmp3
+ tmp4
);
2637 wsptr
[24] = (__s32
) (tmp3
- tmp4
);
2644 for (ctr
= 0; ctr
< 8; ctr
++) {
2645 outptr
= &(odata
[ctr
*rskip
]);
2647 tmp10
= wsptr
[0] + wsptr
[4];
2648 tmp11
= wsptr
[0] - wsptr
[4];
2650 tmp13
= wsptr
[2] + wsptr
[6];
2651 tmp12
= MULTIPLY(wsptr
[2] - wsptr
[6], FIX_1_414213562
) - tmp13
;
2653 tmp0
= tmp10
+ tmp13
;
2654 tmp3
= tmp10
- tmp13
;
2655 tmp1
= tmp11
+ tmp12
;
2656 tmp2
= tmp11
- tmp12
;
2658 z13
= wsptr
[5] + wsptr
[3];
2659 z10
= wsptr
[5] - wsptr
[3];
2660 z11
= wsptr
[1] + wsptr
[7];
2661 z12
= wsptr
[1] - wsptr
[7];
2664 tmp11
= MULTIPLY(z11
- z13
, FIX_1_414213562
);
2666 z5
= MULTIPLY(z10
+ z12
, FIX_1_847759065
);
2667 tmp10
= MULTIPLY(z12
, FIX_1_082392200
) - z5
;
2668 tmp12
= MULTIPLY(z10
, - FIX_2_613125930
) + z5
;
2670 tmp6
= tmp12
- tmp7
;
2671 tmp5
= tmp11
- tmp6
;
2672 tmp4
= tmp10
+ tmp5
;
2674 outptr
[0] = RL(DESCALE(tmp0
+ tmp7
));
2675 outptr
[7] = RL(DESCALE(tmp0
- tmp7
));
2676 outptr
[1] = RL(DESCALE(tmp1
+ tmp6
));
2677 outptr
[6] = RL(DESCALE(tmp1
- tmp6
));
2678 outptr
[2] = RL(DESCALE(tmp2
+ tmp5
));
2679 outptr
[5] = RL(DESCALE(tmp2
- tmp5
));
2680 outptr
[4] = RL(DESCALE(tmp3
+ tmp4
));
2681 outptr
[3] = RL(DESCALE(tmp3
- tmp4
));
2691 This file contains most of the initialisation and control functions
2693 (C) Justin Schoeman 1998
2701 Initialise all the cache-aligned data blocks
2705 void RTjpeg_init_data(void)
2709 dptr
=(unsigned long)&(RTjpeg_alldata
[0]);
2712 dptr
=dptr
<<5; /* cache align data */
2714 RTjpeg_block
=(__s16
*)dptr
;
2715 dptr
+=sizeof(__s16
)*64;
2716 RTjpeg_lqt
=(__s32
*)dptr
;
2717 dptr
+=sizeof(__s32
)*64;
2718 RTjpeg_cqt
=(__s32
*)dptr
;
2719 dptr
+=sizeof(__s32
)*64;
2720 RTjpeg_liqt
=(__u32
*)dptr
;
2721 dptr
+=sizeof(__u32
)*64;
2722 RTjpeg_ciqt
=(__u32
*)dptr
;
2729 Re-set quality factor
2731 Input: buf -> pointer to 128 ints for quant values, stored so they can be passed back to the decompressor
2733 Q -> quality factor (192=best, 32=worst)
2736 void RTjpeg_init_Q(__u8 Q
)
2741 qual
=(__u64
)Q
<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2745 RTjpeg_lqt
[i
]=(__s32
)((qual
/((__u64
)RTjpeg_lum_quant_tbl
[i
]<<16))>>3);
2746 if(RTjpeg_lqt
[i
]==0)RTjpeg_lqt
[i
]=1;
2747 RTjpeg_cqt
[i
]=(__s32
)((qual
/((__u64
)RTjpeg_chrom_quant_tbl
[i
]<<16))>>3);
2748 if(RTjpeg_cqt
[i
]==0)RTjpeg_cqt
[i
]=1;
2749 RTjpeg_liqt
[i
]=(1<<16)/(RTjpeg_lqt
[i
]<<3);
2750 RTjpeg_ciqt
[i
]=(1<<16)/(RTjpeg_cqt
[i
]<<3);
2751 RTjpeg_lqt
[i
]=((1<<16)/RTjpeg_liqt
[i
])>>3;
2752 RTjpeg_cqt
[i
]=((1<<16)/RTjpeg_ciqt
[i
])>>3;
2756 while(RTjpeg_liqt
[RTjpeg_ZZ
[++RTjpeg_lb8
]]<=8);
2759 while(RTjpeg_ciqt
[RTjpeg_ZZ
[++RTjpeg_cb8
]]<=8);
2764 RTjpeg_quant_init();
2771 Initialise compression.
2773 Input: buf -> pointer to 128 ints for quant values, stored so they can be passed back to the decompressor
2775 width -> width of image
2776 height -> height of image
2777 Q -> quality factor (192=best, 32=worst)
2781 void RTjpeg_init_compress(__u32
*buf
, int width
, int height
, __u8 Q
)
2789 RTjpeg_height
=height
;
2790 RTjpeg_Ywidth
= RTjpeg_width
>>3;
2791 RTjpeg_Ysize
=width
* height
;
2792 RTjpeg_Cwidth
= RTjpeg_width
>>4;
2793 RTjpeg_Csize
= (width
>>1) * height
;
2795 qual
=(__u64
)Q
<<(32-7); /* 32 bit FP, 255=2, 0=0 */
2799 RTjpeg_lqt
[i
]=(__s32
)((qual
/((__u64
)RTjpeg_lum_quant_tbl
[i
]<<16))>>3);
2800 if(RTjpeg_lqt
[i
]==0)RTjpeg_lqt
[i
]=1;
2801 RTjpeg_cqt
[i
]=(__s32
)((qual
/((__u64
)RTjpeg_chrom_quant_tbl
[i
]<<16))>>3);
2802 if(RTjpeg_cqt
[i
]==0)RTjpeg_cqt
[i
]=1;
2803 RTjpeg_liqt
[i
]=(1<<16)/(RTjpeg_lqt
[i
]<<3);
2804 RTjpeg_ciqt
[i
]=(1<<16)/(RTjpeg_cqt
[i
]<<3);
2805 RTjpeg_lqt
[i
]=((1<<16)/RTjpeg_liqt
[i
])>>3;
2806 RTjpeg_cqt
[i
]=((1<<16)/RTjpeg_ciqt
[i
])>>3;
2810 while(RTjpeg_liqt
[RTjpeg_ZZ
[++RTjpeg_lb8
]]<=8);
2813 while(RTjpeg_ciqt
[RTjpeg_ZZ
[++RTjpeg_cb8
]]<=8);
2817 RTjpeg_quant_init();
2820 buf
[i
]=le2me_32(RTjpeg_liqt
[i
]);
2822 buf
[64+i
]=le2me_32(RTjpeg_ciqt
[i
]);
2825 void RTjpeg_init_decompress(__u32
*buf
, int width
, int height
)
2832 RTjpeg_height
=height
;
2833 RTjpeg_Ywidth
= RTjpeg_width
>>3;
2834 RTjpeg_Ysize
=width
* height
;
2835 RTjpeg_Cwidth
= RTjpeg_width
>>4;
2836 RTjpeg_Csize
= (width
>>1) * height
;
2840 RTjpeg_liqt
[i
]=le2me_32(buf
[i
]);
2841 RTjpeg_ciqt
[i
]=le2me_32(buf
[i
+64]);
2845 while(RTjpeg_liqt
[RTjpeg_ZZ
[++RTjpeg_lb8
]]<=8);
2848 while(RTjpeg_ciqt
[RTjpeg_ZZ
[++RTjpeg_cb8
]]<=8);
2853 // RTjpeg_color_init();
2856 int RTjpeg_compressYUV420(__s8
*sp
, unsigned char *bp
)
2859 register __s8
* bp1
= bp
+ (RTjpeg_width
<<3);
2860 register __s8
* bp2
= bp
+ RTjpeg_Ysize
;
2861 register __s8
* bp3
= bp2
+ (RTjpeg_Csize
>>1);
2862 register int i
, j
, k
;
2869 for(i
=RTjpeg_height
>>1; i
; i
-=8)
2871 for(j
=0, k
=0; j
<RTjpeg_width
; j
+=16, k
+=8)
2873 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
2874 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2875 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2877 RTjpeg_dctY(bp
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
2878 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2879 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2881 RTjpeg_dctY(bp1
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
2882 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2883 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2885 RTjpeg_dctY(bp1
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
2886 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2887 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2889 RTjpeg_dctY(bp2
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
2890 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
2891 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
2893 RTjpeg_dctY(bp3
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
2894 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
2895 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
2898 bp
+=RTjpeg_width
<<4;
2899 bp1
+=RTjpeg_width
<<4;
2900 bp2
+=RTjpeg_width
<<2;
2901 bp3
+=RTjpeg_width
<<2;
2910 int RTjpeg_compressYUV422(__s8
*sp
, unsigned char *bp
)
2913 register __s8
* bp2
= bp
+ RTjpeg_Ysize
;
2914 register __s8
* bp3
= bp2
+ RTjpeg_Csize
;
2915 register int i
, j
, k
;
2922 for(i
=RTjpeg_height
; i
; i
-=8)
2924 for(j
=0, k
=0; j
<RTjpeg_width
; j
+=16, k
+=8)
2926 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
2927 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2928 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2930 RTjpeg_dctY(bp
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
2931 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2932 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2934 RTjpeg_dctY(bp2
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
2935 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
2936 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
2938 RTjpeg_dctY(bp3
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
2939 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
2940 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
2943 bp
+=RTjpeg_width
<<3;
2944 bp2
+=RTjpeg_width
<<2;
2945 bp3
+=RTjpeg_width
<<2;
2954 int RTjpeg_compress8(__s8
*sp
, unsigned char *bp
)
2965 for(i
=0; i
<RTjpeg_height
; i
+=8)
2967 for(j
=0; j
<RTjpeg_width
; j
+=8)
2969 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_width
);
2970 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
2971 sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
2982 void RTjpeg_decompressYUV422(__s8
*sp
, __u8
*bp
)
2984 register __s8
* bp2
= bp
+ RTjpeg_Ysize
;
2985 register __s8
* bp3
= bp2
+ (RTjpeg_Csize
);
2993 for(i
=RTjpeg_height
; i
; i
-=8)
2995 for(k
=0, j
=0; j
<RTjpeg_width
; j
+=16, k
+=8) {
2999 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3000 RTjpeg_idct(bp
+j
, RTjpeg_block
, RTjpeg_width
);
3005 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3006 RTjpeg_idct(bp
+j
+8, RTjpeg_block
, RTjpeg_width
);
3011 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_cb8
, RTjpeg_ciqt
);
3012 RTjpeg_idct(bp2
+k
, RTjpeg_block
, RTjpeg_width
>>1);
3017 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_cb8
, RTjpeg_ciqt
);
3018 RTjpeg_idct(bp3
+k
, RTjpeg_block
, RTjpeg_width
>>1);
3021 bp
+=RTjpeg_width
<<3;
3022 bp2
+=RTjpeg_width
<<2;
3023 bp3
+=RTjpeg_width
<<2;
3030 void RTjpeg_decompressYUV420(__s8
*sp
, __u8
*bp
)
3032 register __s8
* bp1
= bp
+ (RTjpeg_width
<<3);
3033 register __s8
* bp2
= bp
+ RTjpeg_Ysize
;
3034 register __s8
* bp3
= bp2
+ (RTjpeg_Csize
>>1);
3042 for(i
=RTjpeg_height
>>1; i
; i
-=8)
3044 for(k
=0, j
=0; j
<RTjpeg_width
; j
+=16, k
+=8) {
3048 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3049 RTjpeg_idct(bp
+j
, RTjpeg_block
, RTjpeg_width
);
3054 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3055 RTjpeg_idct(bp
+j
+8, RTjpeg_block
, RTjpeg_width
);
3060 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3061 RTjpeg_idct(bp1
+j
, RTjpeg_block
, RTjpeg_width
);
3066 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3067 RTjpeg_idct(bp1
+j
+8, RTjpeg_block
, RTjpeg_width
);
3072 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_cb8
, RTjpeg_ciqt
);
3073 RTjpeg_idct(bp2
+k
, RTjpeg_block
, RTjpeg_width
>>1);
3078 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_cb8
, RTjpeg_ciqt
);
3079 RTjpeg_idct(bp3
+k
, RTjpeg_block
, RTjpeg_width
>>1);
3082 bp
+=RTjpeg_width
<<4;
3083 bp1
+=RTjpeg_width
<<4;
3084 bp2
+=RTjpeg_width
<<2;
3085 bp3
+=RTjpeg_width
<<2;
3092 void RTjpeg_decompress8(__s8
*sp
, __u8
*bp
)
3101 for(i
=0; i
<RTjpeg_height
; i
+=8)
3103 for(j
=0; j
<RTjpeg_width
; j
+=8)
3107 sp
+=RTjpeg_s2b(RTjpeg_block
, sp
, RTjpeg_lb8
, RTjpeg_liqt
);
3108 RTjpeg_idct(bp
+j
, RTjpeg_block
, RTjpeg_width
);
3110 bp
+=RTjpeg_width
<<3;
3117 Initialise additional data structures for motion compensation
3121 void RTjpeg_init_mcompress(void)
3127 RTjpeg_old
=malloc((4*RTjpeg_width
*RTjpeg_height
)+32);
3128 tmp
=(unsigned long)RTjpeg_old
;
3131 RTjpeg_old
=(__s16
*)(tmp
<<5);
3135 fprintf(stderr
, "RTjpeg: Could not allocate memory\n");
3138 memset(RTjpeg_old
, 0, ((4*RTjpeg_width
*RTjpeg_height
)));
/*
 MMX block comparator: decides whether the freshly quantised block in
 RTjpeg_block differs from the stored previous-frame block `old` by
 more than the per-word threshold in `mask`.
 NOTE(review): several statements of this routine (loop header, the
 absolute-value/accumulate instructions between the subtractions and the
 compares, and the final result test/return) are not visible in this
 chunk - code below left byte-identical.
*/
3143 int RTjpeg_bcomp(__s16
*old
, mmx_t
*mask
)
/* reinterpret both blocks as quadword (4 x __s16) chunks for MMX */
3146 mmx_t
*mold
=(mmx_t
*)old
;
3147 mmx_t
*mblock
=(mmx_t
*)RTjpeg_block
;
/* volatile: written by movq_r2m below, then read back by the C code */
3148 volatile mmx_t result
;
3149 static mmx_t neg
={0xffffffffffffffffULL
};
/* mm7 holds the comparison threshold for all four words */
3151 movq_m2r(*mask
, mm7
);
/* load two quadwords from the new block and two from the old block */
3157 movq_m2r(*(mblock
++), mm0
);
3158 movq_m2r(*(mblock
++), mm2
);
3159 movq_m2r(*(mold
++), mm1
);
3160 movq_m2r(*(mold
++), mm3
);
/* signed-saturating word differences: new - old */
3161 psubsw_r2r(mm1
, mm0
);
3162 psubsw_r2r(mm3
, mm2
);
/* flag words where the (positive) difference exceeds the mask */
3165 pcmpgtw_r2r(mm7
, mm0
);
3166 pcmpgtw_r2r(mm7
, mm2
);
/* same test on the negated copies, catching negative differences */
3169 pcmpgtw_r2r(mm7
, mm1
);
3170 pcmpgtw_r2r(mm7
, mm3
);
/* store the accumulated per-word flags for the scalar decision below */
3176 movq_r2m(mm5
, result
);
3180 // if(!RTjpeg_mtest)
3181 // for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
3189 int RTjpeg_bcomp(__s16
*old
, __u16
*mask
)
3194 if(abs(old
[i
]-RTjpeg_block
[i
])>*mask
)
3197 for(i
=0; i
<16; i
++)((__u64
*)old
)[i
]=((__u64
*)RTjpeg_block
)[i
];
3204 void RTjpeg_set_test(int i
)
3209 int RTjpeg_mcompressYUV420(__s8
*sp
, unsigned char *bp
, __u16 lmask
, __u16 cmask
)
3213 register __s8
* bp1
= bp
+ (RTjpeg_width
<<3);
3214 register __s8
* bp2
= bp
+ RTjpeg_Ysize
;
3215 register __s8
* bp3
= bp2
+ (RTjpeg_Csize
>>1);
3216 register int i
, j
, k
;
3220 RTjpeg_lmask
.uq
=((__u64
)lmask
<<48)|((__u64
)lmask
<<32)|((__u64
)lmask
<<16)|lmask
;
3221 RTjpeg_cmask
.uq
=((__u64
)cmask
<<48)|((__u64
)cmask
<<32)|((__u64
)cmask
<<16)|cmask
;
3230 for(i
=RTjpeg_height
>>1; i
; i
-=8)
3232 for(j
=0, k
=0; j
<RTjpeg_width
; j
+=16, k
+=8)
3234 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
3235 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3236 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3238 *((__u8
*)sp
++)=255;
3240 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3243 RTjpeg_dctY(bp
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
3244 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3245 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3247 *((__u8
*)sp
++)=255;
3249 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3252 RTjpeg_dctY(bp1
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
3253 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3254 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3256 *((__u8
*)sp
++)=255;
3258 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3261 RTjpeg_dctY(bp1
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
3262 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3263 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3265 *((__u8
*)sp
++)=255;
3267 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3270 RTjpeg_dctY(bp2
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
3271 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
3272 if(RTjpeg_bcomp(block
, &RTjpeg_cmask
))
3274 *((__u8
*)sp
++)=255;
3276 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
3279 RTjpeg_dctY(bp3
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
3280 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
3281 if(RTjpeg_bcomp(block
, &RTjpeg_cmask
))
3283 *((__u8
*)sp
++)=255;
3285 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
3288 bp
+=RTjpeg_width
<<4;
3289 bp1
+=RTjpeg_width
<<4;
3290 bp2
+=RTjpeg_width
<<2;
3291 bp3
+=RTjpeg_width
<<2;
3301 int RTjpeg_mcompressYUV422(__s8
*sp
, unsigned char *bp
, __u16 lmask
, __u16 cmask
)
3305 register __s8
* bp2
;
3306 register __s8
* bp3
;
3307 register int i
, j
, k
;
3311 RTjpeg_lmask
.uq
=((__u64
)lmask
<<48)|((__u64
)lmask
<<32)|((__u64
)lmask
<<16)|lmask
;
3312 RTjpeg_cmask
.uq
=((__u64
)cmask
<<48)|((__u64
)cmask
<<32)|((__u64
)cmask
<<16)|cmask
;
3318 bp
= bp
- RTjpeg_width
*0;
3319 bp2
= bp
+ RTjpeg_Ysize
-RTjpeg_width
*0;
3320 bp3
= bp2
+ RTjpeg_Csize
;
3325 for(i
=RTjpeg_height
; i
; i
-=8)
3327 for(j
=0, k
=0; j
<RTjpeg_width
; j
+=16, k
+=8)
3329 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_Ywidth
);
3330 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3331 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3333 *((__u8
*)sp
++)=255;
3335 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3338 RTjpeg_dctY(bp
+j
+8, RTjpeg_block
, RTjpeg_Ywidth
);
3339 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3340 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3342 *((__u8
*)sp
++)=255;
3344 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3347 RTjpeg_dctY(bp2
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
3348 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
3349 if(RTjpeg_bcomp(block
, &RTjpeg_cmask
))
3351 *((__u8
*)sp
++)=255;
3353 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
3356 RTjpeg_dctY(bp3
+k
, RTjpeg_block
, RTjpeg_Cwidth
);
3357 RTjpeg_quant(RTjpeg_block
, RTjpeg_cqt
);
3358 if(RTjpeg_bcomp(block
, &RTjpeg_cmask
))
3360 *((__u8
*)sp
++)=255;
3362 else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_cb8
);
3366 bp
+=RTjpeg_width
<<3;
3367 bp2
+=RTjpeg_width
<<2;
3368 bp3
+=RTjpeg_width
<<2;
3370 printf ("%d\n", block
- RTjpeg_old
);
3377 int RTjpeg_mcompress8(__s8
*sp
, unsigned char *bp
, __u16 lmask
)
3385 RTjpeg_lmask
.uq
=((__u64
)lmask
<<48)|((__u64
)lmask
<<32)|((__u64
)lmask
<<16)|lmask
;
3394 for(i
=0; i
<RTjpeg_height
; i
+=8)
3396 for(j
=0; j
<RTjpeg_width
; j
+=8)
3398 RTjpeg_dctY(bp
+j
, RTjpeg_block
, RTjpeg_width
);
3399 RTjpeg_quant(RTjpeg_block
, RTjpeg_lqt
);
3400 if(RTjpeg_bcomp(block
, &RTjpeg_lmask
))
3402 *((__u8
*)sp
++)=255;
3403 // printf("* %d ", sp[-1]);
3404 } else sp
+=RTjpeg_b2s(RTjpeg_block
, sp
, RTjpeg_lb8
);
3407 bp
+=RTjpeg_width
<<3;
3415 void RTjpeg_color_init(void)
3425 void RTjpeg_yuv422rgb(__u8
*buf
, __u8
*rgb
, int stride
)
3429 __s32 y
, crR
, crG
, cbG
, cbB
;
3430 __u8
*bufcr
, *bufcb
, *bufy
, *bufoute
;
3435 bufcb
=&buf
[RTjpeg_width
*RTjpeg_height
];
3436 bufcr
=&buf
[RTjpeg_width
*RTjpeg_height
+(RTjpeg_width
*RTjpeg_height
)/2];
3440 for(i
=0; i
<(RTjpeg_height
); i
++)
3442 for(j
=0; j
<RTjpeg_width
; j
+=2)
3444 crR
=(*bufcr
-128)*KcrR
;
3445 crG
=(*(bufcr
++)-128)*KcrG
;
3446 cbG
=(*bufcb
-128)*KcbG
;
3447 cbB
=(*(bufcb
++)-128)*KcbB
;
3452 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3453 tmp
=(y
-crG
-cbG
)>>16;
3454 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3456 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3458 y
=(bufy
[j
+1]-16)*Ky
;
3461 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3462 tmp
=(y
-crG
-cbG
)>>16;
3463 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3465 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3473 void RTjpeg_yuv420rgb(__u8
*buf
, __u8
*rgb
, int stride
)
3477 __s32 y
, crR
, crG
, cbG
, cbB
;
3478 __u8
*bufcr
, *bufcb
, *bufy
, *bufoute
, *bufouto
;
3482 oskip
=RTjpeg_width
*3;
3484 oskip
=2*stride
-RTjpeg_width
*3;
3488 bufcb
=&buf
[RTjpeg_width
*RTjpeg_height
];
3489 bufcr
=&buf
[RTjpeg_width
*RTjpeg_height
+(RTjpeg_width
*RTjpeg_height
)/4];
3492 bufouto
=rgb
+RTjpeg_width
*3;
3494 for(i
=0; i
<(RTjpeg_height
>>1); i
++)
3496 for(j
=0; j
<RTjpeg_width
; j
+=2)
3498 crR
=(*bufcr
-128)*KcrR
;
3499 crG
=(*(bufcr
++)-128)*KcrG
;
3500 cbG
=(*bufcb
-128)*KcbG
;
3501 cbB
=(*(bufcb
++)-128)*KcbB
;
3506 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3507 tmp
=(y
-crG
-cbG
)>>16;
3508 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3510 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3512 y
=(bufy
[j
+1]-16)*Ky
;
3515 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3516 tmp
=(y
-crG
-cbG
)>>16;
3517 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3519 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3521 y
=(bufy
[j
+yskip
]-16)*Ky
;
3524 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3525 tmp
=(y
-crG
-cbG
)>>16;
3526 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3528 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3530 y
=(bufy
[j
+1+yskip
]-16)*Ky
;
3533 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3534 tmp
=(y
-crG
-cbG
)>>16;
3535 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3537 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3547 void RTjpeg_yuvrgb32(__u8
*buf
, __u8
*rgb
, int stride
)
3551 __s32 y
, crR
, crG
, cbG
, cbB
;
3552 __u8
*bufcr
, *bufcb
, *bufy
, *bufoute
, *bufouto
;
3556 oskip
=RTjpeg_width
*4;
3558 oskip
= 2*stride
-RTjpeg_width
*4;
3561 bufcb
=&buf
[RTjpeg_width
*RTjpeg_height
];
3562 bufcr
=&buf
[RTjpeg_width
*RTjpeg_height
+(RTjpeg_width
*RTjpeg_height
)/2];
3565 bufouto
=rgb
+RTjpeg_width
*4;
3567 for(i
=0; i
<(RTjpeg_height
>>1); i
++)
3569 for(j
=0; j
<RTjpeg_width
; j
+=2)
3571 crR
=(*bufcr
-128)*KcrR
;
3572 crG
=(*(bufcr
++)-128)*KcrG
;
3573 cbG
=(*bufcb
-128)*KcbG
;
3574 cbB
=(*(bufcb
++)-128)*KcbB
;
3579 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3580 tmp
=(y
-crG
-cbG
)>>16;
3581 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3583 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3586 y
=(bufy
[j
+1]-16)*Ky
;
3589 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3590 tmp
=(y
-crG
-cbG
)>>16;
3591 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3593 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3596 y
=(bufy
[j
+yskip
]-16)*Ky
;
3599 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3600 tmp
=(y
-crG
-cbG
)>>16;
3601 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3603 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3606 y
=(bufy
[j
+1+yskip
]-16)*Ky
;
3609 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3610 tmp
=(y
-crG
-cbG
)>>16;
3611 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3613 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3623 void RTjpeg_yuvrgb24(__u8
*buf
, __u8
*rgb
, int stride
)
3627 __s32 y
, crR
, crG
, cbG
, cbB
;
3628 __u8
*bufcr
, *bufcb
, *bufy
, *bufoute
, *bufouto
;
3632 oskip
=RTjpeg_width
*3;
3634 oskip
=2*stride
- RTjpeg_width
*3;
3638 bufcb
=&buf
[RTjpeg_width
*RTjpeg_height
];
3639 bufcr
=&buf
[RTjpeg_width
*RTjpeg_height
+(RTjpeg_width
*RTjpeg_height
)/4];
3642 bufouto
=rgb
+RTjpeg_width
*3;
3644 for(i
=0; i
<(RTjpeg_height
>>1); i
++)
3646 for(j
=0; j
<RTjpeg_width
; j
+=2)
3648 crR
=(*bufcr
-128)*KcrR
;
3649 crG
=(*(bufcr
++)-128)*KcrG
;
3650 cbG
=(*bufcb
-128)*KcbG
;
3651 cbB
=(*(bufcb
++)-128)*KcbB
;
3656 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3657 tmp
=(y
-crG
-cbG
)>>16;
3658 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3660 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3662 y
=(bufy
[j
+1]-16)*Ky
;
3665 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3666 tmp
=(y
-crG
-cbG
)>>16;
3667 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3669 *(bufoute
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3671 y
=(bufy
[j
+yskip
]-16)*Ky
;
3674 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3675 tmp
=(y
-crG
-cbG
)>>16;
3676 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3678 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3680 y
=(bufy
[j
+1+yskip
]-16)*Ky
;
3683 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3684 tmp
=(y
-crG
-cbG
)>>16;
3685 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3687 *(bufouto
++)=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3696 void RTjpeg_yuvrgb16(__u8
*buf
, __u8
*rgb
, int stride
)
3700 __s32 y
, crR
, crG
, cbG
, cbB
;
3701 __u8
*bufcr
, *bufcb
, *bufy
, *bufoute
, *bufouto
;
3703 unsigned char r
, g
, b
;
3706 oskip
=RTjpeg_width
*2;
3708 oskip
=2*stride
-RTjpeg_width
*2;
3712 bufcb
=&buf
[RTjpeg_width
*RTjpeg_height
];
3713 bufcr
=&buf
[RTjpeg_width
*RTjpeg_height
+(RTjpeg_width
*RTjpeg_height
)/4];
3716 bufouto
=rgb
+RTjpeg_width
*2;
3718 for(i
=0; i
<(RTjpeg_height
>>1); i
++)
3720 for(j
=0; j
<RTjpeg_width
; j
+=2)
3722 crR
=(*bufcr
-128)*KcrR
;
3723 crG
=(*(bufcr
++)-128)*KcrG
;
3724 cbG
=(*bufcb
-128)*KcbG
;
3725 cbB
=(*(bufcb
++)-128)*KcbB
;
3730 b
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3731 tmp
=(y
-crG
-cbG
)>>16;
3732 g
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3734 r
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3735 tmp
=(int)((int)b
>> 3);
3736 tmp
|=(int)(((int)g
>> 2) << 5);
3737 tmp
|=(int)(((int)r
>> 3) << 11);
3738 *(bufoute
++)=tmp
&0xff;
3739 *(bufoute
++)=tmp
>>8;
3742 y
=(bufy
[j
+1]-16)*Ky
;
3745 b
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3746 tmp
=(y
-crG
-cbG
)>>16;
3747 g
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3749 r
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3750 tmp
=(int)((int)b
>> 3);
3751 tmp
|=(int)(((int)g
>> 2) << 5);
3752 tmp
|=(int)(((int)r
>> 3) << 11);
3753 *(bufoute
++)=tmp
&0xff;
3754 *(bufoute
++)=tmp
>>8;
3756 y
=(bufy
[j
+yskip
]-16)*Ky
;
3759 b
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3760 tmp
=(y
-crG
-cbG
)>>16;
3761 g
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3763 r
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3764 tmp
=(int)((int)b
>> 3);
3765 tmp
|=(int)(((int)g
>> 2) << 5);
3766 tmp
|=(int)(((int)r
>> 3) << 11);
3767 *(bufouto
++)=tmp
&0xff;
3768 *(bufouto
++)=tmp
>>8;
3770 y
=(bufy
[j
+1+yskip
]-16)*Ky
;
3773 b
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3774 tmp
=(y
-crG
-cbG
)>>16;
3775 g
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3777 r
=(tmp
>255)?255:((tmp
<0)?0:tmp
);
3778 tmp
=(int)((int)b
>> 3);
3779 tmp
|=(int)(((int)g
>> 2) << 5);
3780 tmp
|=(int)(((int)r
>> 3) << 11);
3781 *(bufouto
++)=tmp
&0xff;
3782 *(bufouto
++)=tmp
>>8;
/*
 Trivial 8-bit "converter": the decoded luma plane is copied straight
 to the destination as greyscale.
 NOTE(review): the stride argument is ignored here - rows are emitted
 packed at RTjpeg_width bytes; confirm callers always pass stride==0
 or stride==RTjpeg_width.
*/
3793 void RTjpeg_yuvrgb8(__u8
*buf
, __u8
*rgb
, int stride
)
3795 memcpy(rgb
, buf
, RTjpeg_width
*RTjpeg_height
);