/* libmpcodecs/native/rtjpegn.c (mplayer.git) */
/*
   RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)

   With modifications by:
   (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
   and
   (c) 1999 by Wim Taymans <wim.taymans@tvd.be>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config.h"

#include "mpbswap.h"
#include "rtjpegn.h"

#if HAVE_MMX
#include "mmx.h"
#endif

//#define SHOWBLOCK 1
#define BETTERCOMPRESSION 1
/* zig-zag scan order: RTjpeg_ZZ[i] is the position within the 8x8
   coefficient block of the i-th coefficient in scan order */
static const unsigned char RTjpeg_ZZ[64]={
0,
8, 1,
2, 9, 16,
24, 17, 10, 3,
4, 11, 18, 25, 32,
40, 33, 26, 19, 12, 5,
6, 13, 20, 27, 34, 41, 48,
56, 49, 42, 35, 28, 21, 14, 7,
15, 22, 29, 36, 43, 50, 57,
58, 51, 44, 37, 30, 23,
31, 38, 45, 52, 59,
60, 53, 46, 39,
47, 54, 61,
62, 55,
63 };
static const __u64 RTjpeg_aan_tab[64]={
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
};
#if !HAVE_MMX
static __s32 RTjpeg_ws[64+31];
#endif
static __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];

static __s16 *block; // rh
static __s16 *RTjpeg_block;
static __s32 *RTjpeg_lqt;
static __s32 *RTjpeg_cqt;
static __u32 *RTjpeg_liqt;
static __u32 *RTjpeg_ciqt;

static unsigned char RTjpeg_lb8;
static unsigned char RTjpeg_cb8;
static int RTjpeg_width, RTjpeg_height;
static int RTjpeg_Ywidth, RTjpeg_Cwidth;
static int RTjpeg_Ysize, RTjpeg_Csize;

static __s16 *RTjpeg_old=NULL;

#if HAVE_MMX
static mmx_t RTjpeg_lmask;
static mmx_t RTjpeg_cmask;
#else
static __u16 RTjpeg_lmask;
static __u16 RTjpeg_cmask;
#endif
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
 16,  11,  10,  16,  24,  40,  51,  61,
 12,  12,  14,  19,  26,  58,  60,  55,
 14,  13,  16,  24,  40,  57,  69,  56,
 14,  17,  22,  29,  51,  87,  80,  62,
 18,  22,  37,  56,  68, 109, 103,  77,
 24,  35,  55,  64,  81, 104, 113,  92,
 49,  64,  78,  87, 103, 121, 120, 101,
 72,  92,  95,  98, 112, 100, 103,  99
};

static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
 17,  18,  24,  47,  99,  99,  99,  99,
 18,  21,  26,  66,  99,  99,  99,  99,
 24,  26,  56,  99,  99,  99,  99,  99,
 47,  66,  99,  99,  99,  99,  99,  99,
 99,  99,  99,  99,  99,  99,  99,  99,
 99,  99,  99,  99,  99,  99,  99,  99,
 99,  99,  99,  99,  99,  99,  99,  99,
 99,  99,  99,  99,  99,  99,  99,  99
};
#ifdef BETTERCOMPRESSION

/*---------------------------------------------------*/
/* better encoding, but needs a lot more CPU time    */
/* seems to be more effective than the old method    */
/* plus lzo; with this encoding lzo isn't efficient  */
/* anymore - there is still potential for better     */
/* encoding, but that would need even more CPU time  */
/* anyway, your mileage may vary                     */
/*                                                   */
/* written by Martin BIELY and Roman HOCHLEITNER     */
/*---------------------------------------------------*/

/* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Block to Stream (encoding)                          */
/*                                                     */
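/* Stream layout produced by RTjpeg_b2s() below (a summary derived from
   the code, added here for readability):
     byte 0           DC coefficient, clamped to 0..254
     byte 1, bits 7-2 zig-zag index of the last nonzero AC coefficient
                      (if that index is 0 the block ends after 2 bytes)
   then, scanning from that index down towards 1, two bits per
   coefficient: 00 = zero, 01 = +1, 11 = -1, 10 = escape to the next
   phase; in the 4-bit phase each coefficient is a signed nibble in
   -7..7, with 0x8 escaping to the final phase, where every remaining
   coefficient is stored as one signed byte clamped to -128..127. */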
static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
{
 register int ci, co=1;
 register __s16 ZZvalue;
 register unsigned char bitten;
 register unsigned char bitoff;

#ifdef SHOWBLOCK
{
 int ii;
 for (ii=0; ii < 64; ii++) {
  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
 }
 fprintf(stdout, "\n\n");
}
#endif

 // first byte is always written
 ((__u8*)strm)[0]=
  (__u8)((data[RTjpeg_ZZ[0]]>254) ? 254 : ((data[RTjpeg_ZZ[0]]<0) ? 0 : data[RTjpeg_ZZ[0]]));

 /* find the last nonzero coefficient in zig-zag order */
 ci=63;
 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;

 bitten = ((unsigned char)ci) << 2;

 if (ci==0) {
   ((__u8*)strm)[1]= bitten;
   co = 2;
   return (int)co;
 }

 /* bitoff=0 because the high 6 bits contain the first nonzero position */
 bitoff = 0;
 co = 1;

 /* 2-bit phase: 00 = zero, 01 = +1, 11 = -1, 10 = escape */
 for(; ci>0; ci--) {

   ZZvalue = data[RTjpeg_ZZ[ci]];

   switch(ZZvalue) {
   case 0:
        break;
   case 1:
        bitten |= (0x01<<bitoff);
        break;
   case -1:
        bitten |= (0x03<<bitoff);
        break;
   default:
        bitten |= (0x02<<bitoff);
        goto HERZWEH;
        break;
   }

   if( bitoff == 0 ) {
      ((__u8*)strm)[co]= bitten;
      bitten = 0;
      bitoff = 8;
      co++;
   } /* "fall through" */
   bitoff-=2;
 }

 /* ci must be 0 */
 if(bitoff != 6) {
   ((__u8*)strm)[co]= bitten;
   co++;
 }
 goto BAUCHWEH;

HERZWEH:
 /* ci cannot be 0 */
 /* correct bitoff to nibble boundaries */

 switch(bitoff){
 case 4:
 case 6:
      bitoff = 0;
      break;
 case 2:
 case 0:
      ((__u8*)strm)[co]= bitten;
      bitoff = 4;
      co++;
      bitten = 0; // clear half nibble values in bitten
      break;
 default:
      break;
 }

 /* 4-bit phase: signed nibbles, 0x8 = escape to the byte phase */
 for(; ci>0; ci--) {

   ZZvalue = data[RTjpeg_ZZ[ci]];

   if( (ZZvalue > 7) || (ZZvalue < -7) ) {
      bitten |= (0x08<<bitoff);
      goto HIRNWEH;
   }

   bitten |= (ZZvalue&0xf)<<bitoff;

   if( bitoff == 0 ) {
      ((__u8*)strm)[co]= bitten;
      bitten = 0;
      bitoff = 8;
      co++;
   } /* "fall thru" */
   bitoff-=4;
 }

 /* ci must be 0 */
 if( bitoff == 0 ) {
   ((__u8*)strm)[co]= bitten;
   co++;
 }
 goto BAUCHWEH;

HIRNWEH:

 ((__u8*)strm)[co]= bitten;
 co++;

 /* the bit phases are over; the remaining coefficients are whole bytes */
 for(; ci>0; ci--) {

   ZZvalue = data[RTjpeg_ZZ[ci]];

   if(ZZvalue>0)
   {
     strm[co++]=(__s8)((ZZvalue>127) ? 127 : ZZvalue);
   }
   else
   {
     strm[co++]=(__s8)((ZZvalue<-128) ? -128 : ZZvalue);
   }
 }

BAUCHWEH:
 /* we used goto too much - common exit point */
#ifdef SHOWBLOCK
{
 int i;
 fprintf(stdout, "\nco = '%d'\n", co);
 for (i=0; i < co+2; i++) {
  fprintf(stdout, "%d ", strm[i]);
 }
 fprintf(stdout, "\n\n");
}
#endif

 return (int)co;
}
#else

static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
{
 register int ci, co=1, tmp;
 register __s16 ZZvalue;

#ifdef SHOWBLOCK
{
 int ii;
 for (ii=0; ii < 64; ii++) {
  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
 }
 fprintf(stdout, "\n\n");
}
#endif

 ((__u8*)strm)[0]=(__u8)((data[RTjpeg_ZZ[0]]>254) ? 254 : ((data[RTjpeg_ZZ[0]]<0) ? 0 : data[RTjpeg_ZZ[0]]));

 for(ci=1; ci<=bt8; ci++)
 {
  ZZvalue = data[RTjpeg_ZZ[ci]];

  if(ZZvalue>0)
  {
   strm[co++]=(__s8)((ZZvalue>127) ? 127 : ZZvalue);
  }
  else
  {
   strm[co++]=(__s8)((ZZvalue<-128) ? -128 : ZZvalue);
  }
 }

 for(; ci<64; ci++)
 {
  ZZvalue = data[RTjpeg_ZZ[ci]];

  if(ZZvalue>0)
  {
   strm[co++]=(__s8)((ZZvalue>63) ? 63 : ZZvalue);
  }
  else if(ZZvalue<0)
  {
   strm[co++]=(__s8)((ZZvalue<-64) ? -64 : ZZvalue);
  }
  else /* compress zeros */
  {
   tmp=ci;
   do
   {
    ci++;
   }
   while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));

   strm[co++]=(__s8)(63+(ci-tmp));
   ci--;
  }
 }
 return (int)co;
}

static int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
{
 int ci=1, co=1, tmp;
 register int i;

 i=RTjpeg_ZZ[0];
 data[i]=((__u8)strm[0])*qtbl[i];

 for(co=1; co<=bt8; co++)
 {
  i=RTjpeg_ZZ[co];
  data[i]=strm[ci++]*qtbl[i];
 }

 for(; co<64; co++)
 {
  if(strm[ci]>63)
  {
   tmp=co+strm[ci]-63;
   for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
   co--;
  } else
  {
   i=RTjpeg_ZZ[co];
   data[i]=strm[ci]*qtbl[i];
  }
  ci++;
 }
 return (int)ci;
}

#endif
#if HAVE_MMX
static void RTjpeg_quant_init(void)
{
 int i;
 __s16 *qtbl;

 qtbl=(__s16 *)RTjpeg_lqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];

 qtbl=(__s16 *)RTjpeg_cqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
}

static mmx_t RTjpeg_ones={0x0001000100010001LL};
static mmx_t RTjpeg_half={0x7fff7fff7fff7fffLL};

static void RTjpeg_quant(__s16 *block, __s32 *qtbl)
{
 int i;
 mmx_t *bl, *ql;

 ql=(mmx_t *)qtbl;
 bl=(mmx_t *)block;

 movq_m2r(RTjpeg_ones, mm6);
 movq_m2r(RTjpeg_half, mm7);

 for(i=16; i; i--)
 {
  movq_m2r(*(ql++), mm0); /* quant vals (4) */
  movq_m2r(*bl, mm2); /* block vals (4) */
  movq_r2r(mm0, mm1);
  movq_r2r(mm2, mm3);

  punpcklwd_r2r(mm6, mm0); /*           1 qb 1 qa */
  punpckhwd_r2r(mm6, mm1); /*           1 qd 1 qc */

  punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */

  pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */

  psrad_i2r(16, mm0);
  psrad_i2r(16, mm1);

  packssdw_r2r(mm1, mm0);

  movq_r2m(mm0, *(bl++));
 }
}
#else
static void RTjpeg_quant_init(void)
{
}

static void RTjpeg_quant(__s16 *block, __s32 *qtbl)
{
 int i;

 for(i=0; i<64; i++)
  block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
}
#endif
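/* In both versions qtbl[] holds reciprocal quantisers in 16-bit fixed
   point (set up by RTjpeg_init_Q()/RTjpeg_dct_init()), so the multiply
   followed by >>16 effectively divides each coefficient by its
   quantisation step; the +32767 / RTjpeg_half term provides rounding. */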
/*
 * Perform the forward DCT on one block of samples.
 */
#if HAVE_MMX
static mmx_t RTjpeg_C4   ={0x2D412D412D412D41LL};
static mmx_t RTjpeg_C6   ={0x187E187E187E187ELL};
static mmx_t RTjpeg_C2mC6={0x22A322A322A322A3LL};
static mmx_t RTjpeg_C2pC6={0x539F539F539F539FLL};
static mmx_t RTjpeg_zero ={0x0000000000000000LL};

#else

#define FIX_0_382683433  ((__s32)   98)		/* FIX(0.382683433) */
#define FIX_0_541196100  ((__s32)  139)		/* FIX(0.541196100) */
#define FIX_0_707106781  ((__s32)  181)		/* FIX(0.707106781) */
#define FIX_1_306562965  ((__s32)  334)		/* FIX(1.306562965) */

#define DESCALE10(x) (__s16)( ((x)+128) >> 8)
#define DESCALE20(x)  (__s16)(((x)+32768) >> 16)
#define D_MULTIPLY(var,const)  ((__s32) ((var) * (const)))
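/* Note on the fixed-point scheme (added explanation): FIX(x) means
   round(x * 256), i.e. the AAN rotation factors in 8-bit fixed point
   (e.g. 0.707106781 * 256 = 181.02 -> 181), so each D_MULTIPLY() adds
   8 fraction bits; DESCALE10()/DESCALE20() shift out 8 resp. 16
   fraction bits with rounding. */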
#endif

static void RTjpeg_dct_init(void)
{
 int i;

 for(i=0; i<64; i++)
 {
  RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
  RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
 }
}
static void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
{
#if !HAVE_MMX
  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __s32 tmp10, tmp11, tmp12, tmp13;
  __s32 z1, z2, z3, z4, z5, z11, z13;
  __u8 *idataptr;
  __s16 *odataptr;
  __s32 *wsptr;
  int ctr;

  idataptr = idata;
  wsptr = RTjpeg_ws;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = idataptr[0] + idataptr[7];
    tmp7 = idataptr[0] - idataptr[7];
    tmp1 = idataptr[1] + idataptr[6];
    tmp6 = idataptr[1] - idataptr[6];
    tmp2 = idataptr[2] + idataptr[5];
    tmp5 = idataptr[2] - idataptr[5];
    tmp3 = idataptr[3] + idataptr[4];
    tmp4 = idataptr[3] - idataptr[4];

    tmp10 = (tmp0 + tmp3);	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = (tmp1 + tmp2);
    tmp12 = tmp1 - tmp2;

    wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
    wsptr[4] = (tmp10 - tmp11)<<8;

    z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
    wsptr[2] = (tmp13<<8) + z1;	/* phase 5 */
    wsptr[6] = (tmp13<<8) - z1;

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
    z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
    z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
    z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */

    z11 = (tmp7<<8) + z3;	/* phase 5 */
    z13 = (tmp7<<8) - z3;

    wsptr[5] = z13 + z2;	/* phase 6 */
    wsptr[3] = z13 - z2;
    wsptr[1] = z11 + z4;
    wsptr[7] = z11 - z4;

    idataptr += rskip<<3;	/* advance pointer to next row */
    wsptr += 8;
  }

  wsptr = RTjpeg_ws;
  odataptr=odata;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = wsptr[0] + wsptr[56];
    tmp7 = wsptr[0] - wsptr[56];
    tmp1 = wsptr[8] + wsptr[48];
    tmp6 = wsptr[8] - wsptr[48];
    tmp2 = wsptr[16] + wsptr[40];
    tmp5 = wsptr[16] - wsptr[40];
    tmp3 = wsptr[24] + wsptr[32];
    tmp4 = wsptr[24] - wsptr[32];

    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
    odataptr[32] = DESCALE10(tmp10 - tmp11);

    z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
    odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
    odataptr[48] = DESCALE20((tmp13<<8) - z1);

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
    z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
    z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
    z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */

    z11 = (tmp7<<8) + z3;	/* phase 5 */
    z13 = (tmp7<<8) - z3;

    odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
    odataptr[24] = DESCALE20(z13 - z2);
    odataptr[8] = DESCALE20(z11 + z4);
    odataptr[56] = DESCALE20(z11 - z4);

    odataptr++;			/* advance pointer to next column */
    wsptr++;
  }
#else
  volatile mmx_t tmp6, tmp7;
  register mmx_t *dataptr = (mmx_t *)odata;
  mmx_t *idata2 = (mmx_t *)idata;

  // first copy the input 8 bit to the destination 16 bits

  movq_m2r(RTjpeg_zero, mm2);

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+1));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+2));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+3));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+4));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+5));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+6));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+7));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+8));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+9));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+10));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+11));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+12));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+13));

  idata2 += rskip;

  movq_m2r(*idata2, mm0);
  movq_r2r(mm0, mm1);

  punpcklbw_r2r(mm2, mm0);
  movq_r2m(mm0, *(dataptr+14));

  punpckhbw_r2r(mm2, mm1);
  movq_r2m(mm1, *(dataptr+15));
  /* Start Transpose to do calculations on rows */

  movq_m2r(*(dataptr+9), mm7);	// m03:m02|m01:m00 - first line (line 4) and copy into m5

  movq_m2r(*(dataptr+13), mm6);	// m23:m22|m21:m20 - third line (line 6) and copy into m2
  movq_r2r(mm7, mm5);

  punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm6, mm2);

  punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm7, mm1);

  movq_m2r(*(dataptr+11), mm3);	// m13:m12|m11:m10 - second line
  punpckldq_r2r(mm6, mm7);	// m30:m20|m10:m00 - interleave to produce result 1

  movq_m2r(*(dataptr+15), mm0);	// m33:m32|m31:m30 - fourth line
  punpckhdq_r2r(mm6, mm1);	// m31:m21|m11:m01 - interleave to produce result 2

  movq_r2m(mm7,*(dataptr+9));	// write result 1
  punpckhwd_r2r(mm3, mm5);	// m13:m03|m12:m02 - interleave first and second lines

  movq_r2m(mm1,*(dataptr+11));	// write result 2
  punpckhwd_r2r(mm0, mm2);	// m33:m23|m32:m22 - interleave third and fourth lines

  movq_r2r(mm5, mm1);
  punpckldq_r2r(mm2, mm5);	// m32:m22|m12:m02 - interleave to produce result 3

  movq_m2r(*(dataptr+1), mm0);	// m03:m02|m01:m00 - first line, 4x4
  punpckhdq_r2r(mm2, mm1);	// m33:m23|m13:m03 - interleave to produce result 4

  movq_r2m(mm5,*(dataptr+13));	// write result 3

  // last 4x4 done

  movq_r2m(mm1, *(dataptr+15));	// write result 4, last 4x4

  movq_m2r(*(dataptr+5), mm2);	// m23:m22|m21:m20 - third line
  movq_r2r(mm0, mm6);

  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm2, mm7);

  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm0, mm4);

  movq_m2r(*(dataptr+8), mm1);	// n03:n02|n01:n00 - first line
  punpckldq_r2r(mm2, mm0);	// m30:m20|m10:m00 - interleave to produce first result

  movq_m2r(*(dataptr+12), mm3);	// n23:n22|n21:n20 - third line
  punpckhdq_r2r(mm2, mm4);	// m31:m21|m11:m01 - interleave to produce second result

  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
  movq_r2r(mm1, mm2);		// copy first line

  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
  movq_r2r(mm6, mm5);		// copy first intermediate result

  movq_r2m(mm0, *(dataptr+8));	// write result 1
  punpckhdq_r2r(mm7, mm5);	// m33:m23|m13:m03 - produce third result

  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
  movq_r2r(mm3, mm0);		// copy third line

  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines

  movq_r2m(mm4, *(dataptr+10));	// write result 2 out
  punpckldq_r2r(mm7, mm6);	// m32:m22|m12:m02 - produce fourth result

  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
  movq_r2r(mm1, mm4);

  movq_r2m(mm6, *(dataptr+12));	// write result 3 out
  punpckldq_r2r(mm3, mm1);	// n30:n20|n10:n00 - produce first result

  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
  movq_r2r(mm2, mm6);

  movq_r2m(mm5, *(dataptr+14));	// write result 4 out
  punpckhdq_r2r(mm3, mm4);	// n31:n21|n11:n01 - produce second result

  movq_r2m(mm1, *(dataptr+1));	// write result 5 out - (first result for other 4 x 4 block)
  punpckldq_r2r(mm0, mm2);	// n32:n22|n12:n02 - produce third result

  movq_r2m(mm4, *(dataptr+3));	// write result 6 out
  punpckhdq_r2r(mm0, mm6);	// n33:n23|n13:n03 - produce fourth result

  movq_r2m(mm2, *(dataptr+5));	// write result 7 out

  movq_m2r(*dataptr, mm0);	// m03:m02|m01:m00 - first line, first 4x4

  movq_r2m(mm6, *(dataptr+7));	// write result 8 out

  // Do first 4x4 quadrant, which is used in the beginning of the DCT:

  movq_m2r(*(dataptr+4), mm7);	// m23:m22|m21:m20 - third line
  movq_r2r(mm0, mm2);

  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm7, mm4);

  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm0, mm1);

  movq_m2r(*(dataptr+2), mm6);	// m13:m12|m11:m10 - second line
  punpckldq_r2r(mm7, mm0);	// m30:m20|m10:m00 - interleave to produce result 1

  movq_m2r(*(dataptr+6), mm5);	// m33:m32|m31:m30 - fourth line
  punpckhdq_r2r(mm7, mm1);	// m31:m21|m11:m01 - interleave to produce result 2

  movq_r2r(mm0, mm7);		// write result 1
  punpckhwd_r2r(mm6, mm2);	// m13:m03|m12:m02 - interleave first and second lines

  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
  movq_r2r(mm1, mm6);		// write result 2

  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
  punpckhwd_r2r(mm5, mm4);	// m33:m23|m32:m22 - interleave third and fourth lines

  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
  movq_r2r(mm2, mm3);		// copy first intermediate result

  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
  punpckldq_r2r(mm4, mm2);	// m32:m22|m12:m02 - interleave to produce result 3

  movq_r2m(mm7, tmp7);
  movq_r2r(mm2, mm5);		// write result 3

  movq_r2m(mm6, tmp6);
  punpckhdq_r2r(mm4, mm3);	// m33:m23|m13:m03 - interleave to produce result 4

  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
  movq_r2r(mm3, mm4);		// write result 4

/************************************************************************************************
					End of Transpose
************************************************************************************************/
  paddw_m2r(*(dataptr+8), mm3);	// tmp03=x3+x4 /* stage 1*/
  movq_r2r(mm0, mm7);

  psubw_m2r(*(dataptr+8), mm4);	// tmp04=x3-x4 /* stage 1*/
  movq_r2r(mm1, mm6);

  paddw_r2r(mm3, mm0);		// tmp10 = tmp00 + tmp03 /* even 2 */
  psubw_r2r(mm3, mm7);		// tmp13 = tmp00 - tmp03 /* even 2 */

  psubw_r2r(mm2, mm6);		// tmp12 = tmp01 - tmp02 /* even 2 */
  paddw_r2r(mm2, mm1);		// tmp11 = tmp01 + tmp02 /* even 2 */

  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
  paddw_r2r(mm7, mm6);		// tmp12 + tmp13

  /* stage 3 */

  movq_m2r(tmp6, mm2);
  movq_r2r(mm0, mm3);

  psllw_i2r(2, mm6);		// m8 * 2^2
  paddw_r2r(mm1, mm0);

  pmulhw_m2r(RTjpeg_C4, mm6);	// z1
  psubw_r2r(mm1, mm3);

  movq_r2m(mm0, *dataptr);
  movq_r2r(mm7, mm0);

  /* Odd part */
  movq_r2m(mm3, *(dataptr+8));
  paddw_r2r(mm5, mm4);		// tmp10

  movq_m2r(tmp7, mm3);
  paddw_r2r(mm6, mm0);		// tmp32

  paddw_r2r(mm2, mm5);		// tmp11
  psubw_r2r(mm6, mm7);		// tmp33

  movq_r2m(mm0, *(dataptr+4));
  paddw_r2r(mm3, mm2);		// tmp12

  /* stage 4 */

  movq_r2m(mm7, *(dataptr+12));
  movq_r2r(mm4, mm1);		// copy of tmp10

  psubw_r2r(mm2, mm1);		// tmp10 - tmp12
  psllw_i2r(2, mm4);		// m8 * 2^2

  movq_m2r(RTjpeg_C2mC6, mm0);
  psllw_i2r(2, mm1);

  pmulhw_m2r(RTjpeg_C6, mm1);	// z5
  psllw_i2r(2, mm2);

  pmulhw_r2r(mm0, mm4);		// z5

  /* stage 5 */

  pmulhw_m2r(RTjpeg_C2pC6, mm2);
  psllw_i2r(2, mm5);

  pmulhw_m2r(RTjpeg_C4, mm5);	// z3
  movq_r2r(mm3, mm0);		// copy tmp7

  movq_m2r(*(dataptr+1), mm7);
  paddw_r2r(mm1, mm4);		// z2

  paddw_r2r(mm1, mm2);		// z4

  paddw_r2r(mm5, mm0);		// z11
  psubw_r2r(mm5, mm3);		// z13

  /* stage 6 */

  movq_r2r(mm3, mm5);		// copy z13
  psubw_r2r(mm4, mm3);		// y3=z13 - z2

  paddw_r2r(mm4, mm5);		// y5=z13 + z2
  movq_r2r(mm0, mm6);		// copy z11

  movq_r2m(mm3, *(dataptr+6));	//save y3
  psubw_r2r(mm2, mm0);		// y7=z11 - z4

  movq_r2m(mm5, *(dataptr+10));	//save y5
  paddw_r2r(mm2, mm6);		// y1=z11 + z4

  movq_r2m(mm0, *(dataptr+14));	//save y7

/************************************************
 * End of 1st 4 rows
 ************************************************/
  movq_m2r(*(dataptr+3), mm1);	// load x1 /* stage 1 */
  movq_r2r(mm7, mm0);		// copy x0

  movq_r2m(mm6, *(dataptr+2));	//save y1

  movq_m2r(*(dataptr+5), mm2);	// load x2 /* stage 1 */
  movq_r2r(mm1, mm6);		// copy x1

  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7

  movq_m2r(*(dataptr+7), mm3);	// load x3 /* stage 1 */
  movq_r2r(mm2, mm5);		// copy x2

  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
  movq_r2r(mm3, mm4);		// copy x3

  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6

  movq_r2m(mm7, tmp7);		// save tmp07
  movq_r2r(mm0, mm7);		// copy tmp00

  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6

  /* stage 2, Even Part */

  paddw_m2r(*(dataptr+9), mm3);	// tmp03 = x3 + x4

  movq_r2m(mm6, tmp6);		// save tmp06
  movq_r2r(mm1, mm6);		// copy tmp01

  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
  paddw_r2r(mm3, mm0);		// tmp10 = tmp00 + tmp03

  psubw_r2r(mm3, mm7);		// tmp13 = tmp00 - tmp03

  psubw_m2r(*(dataptr+9), mm4);	// tmp04 = x3 - x4
  psubw_r2r(mm2, mm6);		// tmp12 = tmp01 - tmp02

  paddw_r2r(mm2, mm1);		// tmp11 = tmp01 + tmp02

  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
  paddw_r2r(mm7, mm6);		// tmp12 + tmp13

  /* stage 3, Even and stage 4 & 5 even */

  movq_m2r(tmp6, mm2);		// load tmp6
  movq_r2r(mm0, mm3);		// copy tmp10

  psllw_i2r(2, mm6);		// shift z1
  paddw_r2r(mm1, mm0);		// y0=tmp10 + tmp11

  pmulhw_m2r(RTjpeg_C4, mm6);	// z1
  psubw_r2r(mm1, mm3);		// y4=tmp10 - tmp11

  movq_r2m(mm0, *(dataptr+1));	//save y0
  movq_r2r(mm7, mm0);		// copy tmp13

  /* odd part */

  movq_r2m(mm3, *(dataptr+9));	//save y4
  paddw_r2r(mm5, mm4);		// tmp10 = tmp4 + tmp5

  movq_m2r(tmp7, mm3);		// load tmp7
  paddw_r2r(mm6, mm0);		// tmp32 = tmp13 + z1

  paddw_r2r(mm2, mm5);		// tmp11 = tmp5 + tmp6
  psubw_r2r(mm6, mm7);		// tmp33 = tmp13 - z1

  movq_r2m(mm0, *(dataptr+5));	//save y2
  paddw_r2r(mm3, mm2);		// tmp12 = tmp6 + tmp7

  /* stage 4 */

  movq_r2m(mm7, *(dataptr+13));	//save y6
  movq_r2r(mm4, mm1);		// copy tmp10

  psubw_r2r(mm2, mm1);		// tmp10 - tmp12
  psllw_i2r(2, mm4);		// shift tmp10

  movq_m2r(RTjpeg_C2mC6, mm0);	// load C2mC6
  psllw_i2r(2, mm1);		// shift (tmp10-tmp12)

  pmulhw_m2r(RTjpeg_C6, mm1);	// z5
  psllw_i2r(2, mm5);		// prepare for multiply

  pmulhw_r2r(mm0, mm4);		// multiply by converted real

  /* stage 5 */

  pmulhw_m2r(RTjpeg_C4, mm5);	// z3
  psllw_i2r(2, mm2);		// prepare for multiply

  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
  movq_r2r(mm3, mm0);		// copy tmp7

  movq_m2r(*(dataptr+9), mm7);	// m03:m02|m01:m00 - first line (line 4) and copy into mm7
  paddw_r2r(mm1, mm4);		// z2

  paddw_r2r(mm5, mm0);		// z11
  psubw_r2r(mm5, mm3);		// z13

  /* stage 6 */

  movq_r2r(mm3, mm5);		// copy z13
  paddw_r2r(mm1, mm2);		// z4

  movq_r2r(mm0, mm6);		// copy z11
  psubw_r2r(mm4, mm5);		// y3

  paddw_r2r(mm2, mm6);		// y1
  paddw_r2r(mm4, mm3);		// y5

  movq_r2m(mm5, *(dataptr+7));	//save y3

  movq_r2m(mm6, *(dataptr+3));	//save y1
  psubw_r2r(mm2, mm0);		// y7

/************************************************************************************************
				Start of Transpose
************************************************************************************************/
  movq_m2r(*(dataptr+13), mm6);	// m23:m22|m21:m20 - third line (line 6) and copy into m2
  movq_r2r(mm7, mm5);		// copy first line

  punpcklwd_r2r(mm3, mm7);	// m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm6, mm2);		// copy third line

  punpcklwd_r2r(mm0, mm6);	// m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm7, mm1);		// copy first intermediate result

  punpckldq_r2r(mm6, mm7);	// m30:m20|m10:m00 - interleave to produce result 1

  punpckhdq_r2r(mm6, mm1);	// m31:m21|m11:m01 - interleave to produce result 2

  movq_r2m(mm7, *(dataptr+9));	// write result 1
  punpckhwd_r2r(mm3, mm5);	// m13:m03|m12:m02 - interleave first and second lines

  movq_r2m(mm1, *(dataptr+11));	// write result 2
  punpckhwd_r2r(mm0, mm2);	// m33:m23|m32:m22 - interleave third and fourth lines

  movq_r2r(mm5, mm1);		// copy first intermediate result
  punpckldq_r2r(mm2, mm5);	// m32:m22|m12:m02 - interleave to produce result 3

  movq_m2r(*(dataptr+1), mm0);	// m03:m02|m01:m00 - first line, 4x4
  punpckhdq_r2r(mm2, mm1);	// m33:m23|m13:m03 - interleave to produce result 4

  movq_r2m(mm5, *(dataptr+13));	// write result 3

  /****** last 4x4 done */

  movq_r2m(mm1, *(dataptr+15));	// write result 4, last 4x4

  movq_m2r(*(dataptr+5), mm2);	// m23:m22|m21:m20 - third line
  movq_r2r(mm0, mm6);		// copy first line

  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm2, mm7);		// copy third line

  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm0, mm4);		// copy first intermediate result

  movq_m2r(*(dataptr+8), mm1);	// n03:n02|n01:n00 - first line
  punpckldq_r2r(mm2, mm0);	// m30:m20|m10:m00 - interleave to produce first result

  movq_m2r(*(dataptr+12), mm3);	// n23:n22|n21:n20 - third line
  punpckhdq_r2r(mm2, mm4);	// m31:m21|m11:m01 - interleave to produce second result

  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
  movq_r2r(mm1, mm2);		// copy first line

  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
  movq_r2r(mm6, mm5);		// copy first intermediate result

  movq_r2m(mm0, *(dataptr+8));	// write result 1
  punpckhdq_r2r(mm7, mm5);	// m33:m23|m13:m03 - produce third result

  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
  movq_r2r(mm3, mm0);		// copy third line

  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines

  movq_r2m(mm4, *(dataptr+10));	// write result 2 out
  punpckldq_r2r(mm7, mm6);	// m32:m22|m12:m02 - produce fourth result

  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
  movq_r2r(mm1, mm4);		// copy second intermediate result

  movq_r2m(mm6, *(dataptr+12));	// write result 3 out
  punpckldq_r2r(mm3, mm1);	// n30:n20|n10:n00 - produce first result

  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
  movq_r2r(mm2, mm6);		// copy second intermediate result

  movq_r2m(mm5, *(dataptr+14));	// write result 4 out
  punpckhdq_r2r(mm3, mm4);	// n31:n21|n11:n01 - produce second result

  movq_r2m(mm1, *(dataptr+1));	// write result 5 out - (first result for other 4 x 4 block)
  punpckldq_r2r(mm0, mm2);	// n32:n22|n12:n02 - produce third result

  movq_r2m(mm4, *(dataptr+3));	// write result 6 out
  punpckhdq_r2r(mm0, mm6);	// n33:n23|n13:n03 - produce fourth result

  movq_r2m(mm2, *(dataptr+5));	// write result 7 out

  movq_m2r(*dataptr, mm0);	// m03:m02|m01:m00 - first line, first 4x4

  movq_r2m(mm6, *(dataptr+7));	// write result 8 out

  // Do first 4x4 quadrant, which is used in the beginning of the DCT:

  movq_m2r(*(dataptr+4), mm7);	// m23:m22|m21:m20 - third line
  movq_r2r(mm0, mm2);		// copy first line

  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
  movq_r2r(mm7, mm4);		// copy third line

  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
  movq_r2r(mm0, mm1);		// copy first intermediate result

  movq_m2r(*(dataptr+2), mm6);	// m13:m12|m11:m10 - second line
  punpckldq_r2r(mm7, mm0);	// m30:m20|m10:m00 - interleave to produce result 1

  movq_m2r(*(dataptr+6), mm5);	// m33:m32|m31:m30 - fourth line
  punpckhdq_r2r(mm7, mm1);	// m31:m21|m11:m01 - interleave to produce result 2

  movq_r2r(mm0, mm7);		// write result 1
  punpckhwd_r2r(mm6, mm2);	// m13:m03|m12:m02 - interleave first and second lines

  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
  movq_r2r(mm1, mm6);		// write result 2

  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
  punpckhwd_r2r(mm5, mm4);	// m33:m23|m32:m22 - interleave third and fourth lines

  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
  movq_r2r(mm2, mm3);		// copy first intermediate result

  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
  punpckldq_r2r(mm4, mm2);	// m32:m22|m12:m02 - interleave to produce result 3

  movq_r2m(mm7, tmp7);		// save tmp07
  movq_r2r(mm2, mm5);		// write result 3

  movq_r2m(mm6, tmp6);		// save tmp06

  punpckhdq_r2r(mm4, mm3);	// m33:m23|m13:m03 - interleave to produce result 4

  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
  movq_r2r(mm3, mm4);		// write result 4

/************************************************************************************************
				End of Transpose 2
************************************************************************************************/
  paddw_m2r(*(dataptr+8), mm3);	// tmp03=x3+x4 /* stage 1*/
  movq_r2r(mm0, mm7);

  psubw_m2r(*(dataptr+8), mm4);	// tmp04=x3-x4 /* stage 1*/
  movq_r2r(mm1, mm6);

  paddw_r2r(mm3, mm0);		// tmp10 = tmp00 + tmp03 /* even 2 */
  psubw_r2r(mm3, mm7);		// tmp13 = tmp00 - tmp03 /* even 2 */

  psubw_r2r(mm2, mm6);		// tmp12 = tmp01 - tmp02 /* even 2 */
  paddw_r2r(mm2, mm1);		// tmp11 = tmp01 + tmp02 /* even 2 */

  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
  paddw_r2r(mm7, mm6);		// tmp12 + tmp13

  /* stage 3 */

  movq_m2r(tmp6, mm2);
  movq_r2r(mm0, mm3);

  psllw_i2r(2, mm6);		// m8 * 2^2
  paddw_r2r(mm1, mm0);

  pmulhw_m2r(RTjpeg_C4, mm6);	// z1
  psubw_r2r(mm1, mm3);

  movq_r2m(mm0, *dataptr);
  movq_r2r(mm7, mm0);

  /* Odd part */
  movq_r2m(mm3, *(dataptr+8));
  paddw_r2r(mm5, mm4);		// tmp10

  movq_m2r(tmp7, mm3);
  paddw_r2r(mm6, mm0);		// tmp32

  paddw_r2r(mm2, mm5);		// tmp11
  psubw_r2r(mm6, mm7);		// tmp33

  movq_r2m(mm0, *(dataptr+4));
  paddw_r2r(mm3, mm2);		// tmp12

  /* stage 4 */
  movq_r2m(mm7, *(dataptr+12));
  movq_r2r(mm4, mm1);		// copy of tmp10

  psubw_r2r(mm2, mm1);		// tmp10 - tmp12
  psllw_i2r(2, mm4);		// m8 * 2^2

  movq_m2r(RTjpeg_C2mC6, mm0);
  psllw_i2r(2, mm1);

  pmulhw_m2r(RTjpeg_C6, mm1);	// z5
  psllw_i2r(2, mm2);

  pmulhw_r2r(mm0, mm4);		// z5

  /* stage 5 */

  pmulhw_m2r(RTjpeg_C2pC6, mm2);
  psllw_i2r(2, mm5);

  pmulhw_m2r(RTjpeg_C4, mm5);	// z3
  movq_r2r(mm3, mm0);		// copy tmp7

  movq_m2r(*(dataptr+1), mm7);
  paddw_r2r(mm1, mm4);		// z2

  paddw_r2r(mm1, mm2);		// z4

  paddw_r2r(mm5, mm0);		// z11
  psubw_r2r(mm5, mm3);		// z13

  /* stage 6 */

  movq_r2r(mm3, mm5);		// copy z13
  psubw_r2r(mm4, mm3);		// y3=z13 - z2

  paddw_r2r(mm4, mm5);		// y5=z13 + z2
  movq_r2r(mm0, mm6);		// copy z11

  movq_r2m(mm3, *(dataptr+6));	//save y3
  psubw_r2r(mm2, mm0);		// y7=z11 - z4

  movq_r2m(mm5, *(dataptr+10));	//save y5
  paddw_r2r(mm2, mm6);		// y1=z11 + z4

  movq_r2m(mm0, *(dataptr+14));	//save y7

/************************************************
 * End of 1st 4 rows
 ************************************************/
  movq_m2r(*(dataptr+3), mm1);	// load x1 /* stage 1 */
  movq_r2r(mm7, mm0);		// copy x0

  movq_r2m(mm6, *(dataptr+2));	//save y1

  movq_m2r(*(dataptr+5), mm2);	// load x2 /* stage 1 */
  movq_r2r(mm1, mm6);		// copy x1

  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7

  movq_m2r(*(dataptr+7), mm3);	// load x3 /* stage 1 */
  movq_r2r(mm2, mm5);		// copy x2

  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
  movq_r2r(mm3, mm4);		// copy x3

  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6

  movq_r2m(mm7, tmp7);		// save tmp07
  movq_r2r(mm0, mm7);		// copy tmp00

  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6

  /* stage 2, Even Part */

  paddw_m2r(*(dataptr+9), mm3);	// tmp03 = x3 + x4

  movq_r2m(mm6, tmp6);		// save tmp06
  movq_r2r(mm1, mm6);		// copy tmp01

  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
  paddw_r2r(mm3, mm0);		// tmp10 = tmp00 + tmp03

  psubw_r2r(mm3, mm7);		// tmp13 = tmp00 - tmp03

  psubw_m2r(*(dataptr+9), mm4);	// tmp04 = x3 - x4
  psubw_r2r(mm2, mm6);		// tmp12 = tmp01 - tmp02

  paddw_r2r(mm2, mm1);		// tmp11 = tmp01 + tmp02

  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
  paddw_r2r(mm7, mm6);		// tmp12 + tmp13

  /* stage 3, Even and stage 4 & 5 even */

  movq_m2r(tmp6, mm2);		// load tmp6
  movq_r2r(mm0, mm3);		// copy tmp10

  psllw_i2r(2, mm6);		// shift z1
  paddw_r2r(mm1, mm0);		// y0=tmp10 + tmp11

  pmulhw_m2r(RTjpeg_C4, mm6);	// z1
  psubw_r2r(mm1, mm3);		// y4=tmp10 - tmp11

  movq_r2m(mm0, *(dataptr+1));	//save y0
  movq_r2r(mm7, mm0);		// copy tmp13

  /* odd part */

  movq_r2m(mm3, *(dataptr+9));	//save y4
  paddw_r2r(mm5, mm4);		// tmp10 = tmp4 + tmp5

  movq_m2r(tmp7, mm3);		// load tmp7
  paddw_r2r(mm6, mm0);		// tmp32 = tmp13 + z1

  paddw_r2r(mm2, mm5);		// tmp11 = tmp5 + tmp6
  psubw_r2r(mm6, mm7);		// tmp33 = tmp13 - z1

  movq_r2m(mm0, *(dataptr+5));	//save y2
  paddw_r2r(mm3, mm2);		// tmp12 = tmp6 + tmp7

  /* stage 4 */

  movq_r2m(mm7, *(dataptr+13));	//save y6
  movq_r2r(mm4, mm1);		// copy tmp10

  psubw_r2r(mm2, mm1);		// tmp10 - tmp12
  psllw_i2r(2, mm4);		// shift tmp10

  movq_m2r(RTjpeg_C2mC6, mm0);	// load C2mC6
  psllw_i2r(2, mm1);		// shift (tmp10-tmp12)

  pmulhw_m2r(RTjpeg_C6, mm1);	// z5
  psllw_i2r(2, mm5);		// prepare for multiply

  pmulhw_r2r(mm0, mm4);		// multiply by converted real

  /* stage 5 */

  pmulhw_m2r(RTjpeg_C4, mm5);	// z3
  psllw_i2r(2, mm2);		// prepare for multiply

  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
  movq_r2r(mm3, mm0);		// copy tmp7

  movq_m2r(*(dataptr+9), mm7);	// m03:m02|m01:m00 - first line (line 4) and copy into mm7
  paddw_r2r(mm1, mm4);		// z2

  paddw_r2r(mm5, mm0);		// z11
  psubw_r2r(mm5, mm3);		// z13

  /* stage 6 */

  movq_r2r(mm3, mm5);		// copy z13
  paddw_r2r(mm1, mm2);		// z4

  movq_r2r(mm0, mm6);		// copy z11
  psubw_r2r(mm4, mm5);		// y3

  paddw_r2r(mm2, mm6);		// y1
  paddw_r2r(mm4, mm3);		// y5

  movq_r2m(mm5, *(dataptr+7));	//save y3
  psubw_r2r(mm2, mm0);		// y7=z11 - z4

  movq_r2m(mm3, *(dataptr+11));	//save y5

  movq_r2m(mm6, *(dataptr+3));	//save y1

  movq_r2m(mm0, *(dataptr+15));	//save y7
#endif
}
/*
Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998
*/

/*

Private function

Initialise all the cache-aligned data blocks

*/
static void RTjpeg_init_data(void)
{
 unsigned long dptr;

 dptr=(unsigned long)&(RTjpeg_alldata[0]);
 dptr+=32;
 dptr=dptr>>5;
 dptr=dptr<<5; /* cache align data */

 RTjpeg_block=(__s16 *)dptr;
 dptr+=sizeof(__s16)*64;
 RTjpeg_lqt=(__s32 *)dptr;
 dptr+=sizeof(__s32)*64;
 RTjpeg_cqt=(__s32 *)dptr;
 dptr+=sizeof(__s32)*64;
 RTjpeg_liqt=(__u32 *)dptr;
 dptr+=sizeof(__u32)*64;
 RTjpeg_ciqt=(__u32 *)dptr;
}
/*

External Function

Re-set the quality factor

Input: Q -> quality factor (192=best, 32=worst)

*/
static void RTjpeg_init_Q(__u8 Q)
{
 int i;
 __u64 qual;

 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 for(i=0; i<64; i++)
 {
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 RTjpeg_dct_init();
 RTjpeg_quant_init();
}
/*

External Function

Initialise compression.

Input: buf -> pointer to 128 __u32 in which the quant tables are stored,
              to be passed on to init_decompress
       width -> width of image
       height -> height of image
       Q -> quality factor (192=best, 32=worst)

*/
void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
{
 int i;
 __u64 qual;

 RTjpeg_init_data();

 RTjpeg_width=width;
 RTjpeg_height=height;
 RTjpeg_Ywidth = RTjpeg_width>>3;
 RTjpeg_Ysize=width * height;
 RTjpeg_Cwidth = RTjpeg_width>>4;
 RTjpeg_Csize= (width>>1) * height;

 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 for(i=0; i<64; i++)
 {
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 RTjpeg_dct_init();
 RTjpeg_quant_init();

 for(i=0; i<64; i++)
  buf[i]=le2me_32(RTjpeg_liqt[i]);
 for(i=0; i<64; i++)
  buf[64+i]=le2me_32(RTjpeg_ciqt[i]);
}
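/* A minimal usage sketch for the intra-only path (illustrative only -
   the buffer names and sizing below are assumptions, not part of this
   file's documented contract):

#if 0
 __u32 qtbls[128];         // receives both quant tables for the decoder
 unsigned char *frame;     // planar YUV 4:2:0 input, width*height*3/2 bytes
 __s8 *stream;             // compressed output, sized generously
 int bytes;

 RTjpeg_init_compress(qtbls, width, height, 128); // 128 = mid quality
 bytes = RTjpeg_compressYUV420(stream, frame);
#endif
*/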
int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
{
 __s8 * sb;
 register __s8 * bp1 = bp + (RTjpeg_width<<3);
 register __s8 * bp2 = bp + RTjpeg_Ysize;
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 register int i, j, k;

#if HAVE_MMX
 emms();
#endif
 sb=sp;
 /* Y */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
  }
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#if HAVE_MMX
 emms();
#endif
 return (sp-sb);
}
/*

External Function

Initialise additional data structures for motion compensation

*/

void RTjpeg_init_mcompress(void)
{
 unsigned long tmp;

 if(!RTjpeg_old)
 {
  RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
  tmp=(unsigned long)RTjpeg_old;
  tmp+=32;
  tmp=tmp>>5;
  RTjpeg_old=(__s16 *)(tmp<<5); /* 32-byte align the buffer (the original
                                   malloc() pointer is not kept) */
 }
 if (!RTjpeg_old)
 {
  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
  exit(-1);
 }
 memset(RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height)));
}
#if HAVE_MMX

/* returns 1 if every quantised coefficient differs from the previous
   frame's block by no more than the mask (the block can be skipped),
   0 otherwise */
static int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
{
 int i;
 mmx_t *mold=(mmx_t *)old;
 mmx_t *mblock=(mmx_t *)RTjpeg_block;
 volatile mmx_t result;
 static mmx_t neg={0xffffffffffffffffULL};

 movq_m2r(*mask, mm7);
 movq_m2r(neg, mm6);
 pxor_r2r(mm5, mm5);

 for(i=0; i<8; i++)
 {
  movq_m2r(*(mblock++), mm0);
  movq_m2r(*(mblock++), mm2);
  movq_m2r(*(mold++), mm1);
  movq_m2r(*(mold++), mm3);
  psubsw_r2r(mm1, mm0);
  psubsw_r2r(mm3, mm2);
  movq_r2r(mm0, mm1);
  movq_r2r(mm2, mm3);
  pcmpgtw_r2r(mm7, mm0);
  pcmpgtw_r2r(mm7, mm2);
  pxor_r2r(mm6, mm1);
  pxor_r2r(mm6, mm3);
  pcmpgtw_r2r(mm7, mm1);
  pcmpgtw_r2r(mm7, mm3);
  por_r2r(mm0, mm5);
  por_r2r(mm2, mm5);
  por_r2r(mm1, mm5);
  por_r2r(mm3, mm5);
 }
 movq_r2m(mm5, result);

 if(result.q)
 {
  return 0;
 }
 return 1;
}

#else
static int RTjpeg_bcomp(__s16 *old, __u16 *mask)
{
 int i;

 for(i=0; i<64; i++)
  if(abs(old[i]-RTjpeg_block[i])>*mask)
  {
   for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
   return 0;
  }
 return 1;
}
#endif
int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
{
 __s8 * sb;
 register __s8 * bp1 = bp + (RTjpeg_width<<3);
 register __s8 * bp2 = bp + RTjpeg_Ysize;
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 register int i, j, k;

#if HAVE_MMX
 emms();
 RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask;
 RTjpeg_cmask.uq=((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask;
#else
 RTjpeg_lmask=lmask;
 RTjpeg_cmask=cmask;
#endif

 sb=sp;
 block=RTjpeg_old;
 /* Y */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;
  }
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#if HAVE_MMX
 emms();
#endif
 return (sp-sb);
}
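/* Sketch of the motion-compensated path (illustrative only; names are
   assumptions as in the earlier sketch). Blocks whose quantised
   coefficients stay within lmask/cmask of the previous frame are
   replaced by a single 255 marker byte in the stream:

#if 0
 __u32 qtbls[128];
 unsigned char *frame;
 __s8 *stream;
 int bytes;

 RTjpeg_init_compress(qtbls, width, height, 128);
 RTjpeg_init_mcompress();
 bytes = RTjpeg_mcompressYUV420(stream, frame, 1, 1); // small masks = strict
#endif
*/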