/*
 * motion_comp_mmx.c
 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Modified for use with MPlayer, see libmpeg-0.4.1.diff for the exact changes.
 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
 * $Id$
 */
#include "config.h"

#if defined(ARCH_X86) || defined(ARCH_X86_64)

#include <inttypes.h>

#include "mpeg2.h"
#include "attributes.h"
#include "mpeg2_internal.h"
#include "mmx.h"

#define CPU_MMXEXT 0
#define CPU_3DNOW 1
/* MMX code - needs a rewrite */

/*
 * Motion Compensation frequently needs to average values using the
 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
 * to compute this, but it's been left out of classic MMX.
 *
 * We need to be careful of overflows when doing this computation.
 * Rather than unpacking data to 16 bits, which reduces parallelism,
 * we use the following formulas:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 */
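/*
 * Added illustration (not in the original source; the helper names below
 * are hypothetical): the two identities can be checked in plain scalar C.
 * For x = 0xff, y = 0x01 the naive sum needs 9 bits, but
 * (x|y) - ((x^y)>>1) = 0xff - 0x7f = 0x80 == (0xff + 0x01 + 1) >> 1,
 * so the masked forms never leave 8 bits.
 */
static inline uint8_t scalar_avg_floor (uint8_t x, uint8_t y)
{
    /* (x+y)>>1: shared bits plus half of the differing bits */
    return (x & y) + ((x ^ y) >> 1);
}

static inline uint8_t scalar_avg_round (uint8_t x, uint8_t y)
{
    /* (x+y+1)>>1: union of bits minus half of the differing bits */
    return (x | y) - ((x ^ y) >> 1);
}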
/* some rounding constants */
static mmx_t mask1 = {0xfefefefefefefefeLL};	/* clears the low bit of each
						   byte so a 64-bit psrlq can
						   stand in for a per-byte >>1 */
static mmx_t round4 = {0x0002000200020002LL};	/* +2 per 16-bit lane, for a
						   rounded >>2 */
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */
static inline void mmx_zero_reg ()
{
    /* load 0 into mm0 */
    pxor_r2r (mm0, mm0);
}
static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1) / 2; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    pxor_r2r (mm1, mm3);	/* xor src1 and src2 */
    pand_m2r (mask1, mm3);	/* mask lower bits */
    psrlq_i2r (1, mm3);		/* /2 */
    por_r2r (mm2, mm4);		/* or src1 and src2 */
    psubb_r2r (mm3, mm4);	/* subtract subresults */
    movq_r2m (mm4, *dest);	/* store result in dest */
}
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1) / 2 + 1) / 2; */

    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */

    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm4, mm6);		/* or src1 and src2 */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2r (mm6, mm5);	/* copy subresult */

    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm2, mm6);		/* or srcavg and dest */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2m (mm6, *dest);	/* store result in dest */
}
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2m (mm1, *dest);	/* store result in dest */
}
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2,
					    const uint8_t * src3,
					    const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2) / 4 + 1) / 2; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2r (mm1, mm2);	/* copy subresult */

    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
    pand_m2r (mask1, mm3);	/* mask lower bits */
    psrlq_i2r (1, mm3);		/* /2 */
    por_r2r (mm2, mm4);		/* or srcavg and dest */
    psubb_r2r (mm3, mm4);	/* subtract subresults */
    movq_r2m (mm4, *dest);	/* store result in dest */
}
/*-----------------------------------------------------------------------*/

static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
			       const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, dest, ref);

	if (width == 16)
	    mmx_average_2_U8 (dest+8, dest+8, ref+8);

	dest += stride;
	ref += stride;
    } while (--height);
}

static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
			       const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	movq_m2r (* ref, mm1);		/* load 8 ref bytes */
	movq_r2m (mm1,* dest);		/* store 8 bytes at curr */

	if (width == 16) {
	    movq_m2r (* (ref+8), mm1);	/* load 8 ref bytes */
	    movq_r2m (mm1,* (dest+8));	/* store 8 bytes at curr */
	}

	dest += stride;
	ref += stride;
    } while (--height);
}

static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/
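/*
 * Added note (not in the original source): the MC_* entry points follow
 * libmpeg2's naming scheme. MC_put_* writes the prediction to dest,
 * MC_avg_* averages it with the bytes already in dest (used for
 * bidirectional prediction). The infix names the half-pel case: o = no
 * interpolation, x = horizontal, y = vertical, xy = both; the trailing
 * 16/8 is the block width in pixels.
 */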
/* Half pixel interpolation in the x direction */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	mmx_interp_average_2_U8 (dest, ref, ref+1);

	if (width == 16)
	    mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

	dest += stride;
	ref += stride;
    } while (--height);
}

static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, ref, ref+1);

	if (width == 16)
	    mmx_average_2_U8 (dest+8, ref+8, ref+9);

	dest += stride;
	ref += stride;
    } while (--height);
}

static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/
static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
				  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
	mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

	if (width == 16)
	    mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
				     ref_next+8, ref_next+9);

	dest += stride;
	ref += stride;
	ref_next += stride;
    } while (--height);
}

static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
				  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
	mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

	if (width == 16)
	    mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

	dest += stride;
	ref += stride;
	ref_next += stride;
    } while (--height);
}

static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/
static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
	mmx_interp_average_2_U8 (dest, ref, ref_next);

	if (width == 16)
	    mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

	dest += stride;
	ref += stride;
	ref_next += stride;
    } while (--height);
}

static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, ref, ref_next);

	if (width == 16)
	    mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

	dest += stride;
	ref += stride;
	ref_next += stride;
    } while (--height);
}

static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}


MPEG2_MC_EXTERN (mmx)
/* CPU_MMXEXT/CPU_3DNOW adaptation layer */

#define pavg_r2r(src,dest)		\
do {					\
    if (cpu == CPU_MMXEXT)		\
	pavgb_r2r (src, dest);		\
    else				\
	pavgusb_r2r (src, dest);	\
} while (0)

#define pavg_m2r(src,dest)		\
do {					\
    if (cpu == CPU_MMXEXT)		\
	pavgb_m2r (src, dest);		\
    else				\
	pavgusb_m2r (src, dest);	\
} while (0)
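/*
 * Added note (not in the original source): pavgb (MMXEXT) and pavgusb
 * (3DNow!) both compute the per-byte rounded average (a+b+1)>>1, so the
 * two branches are interchangeable. Every caller passes cpu as a
 * compile-time constant into an inline function, so the if () above is
 * resolved at compile time and no runtime test remains.
 */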
/* CPU_MMXEXT code */

static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride)
{
    do {
	movq_m2r (*ref, mm0);
	movq_r2m (mm0, *dest);
	ref += stride;
	dest += stride;
    } while (--height);
}

static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+8), mm1);
	ref += stride;
	movq_r2m (mm0, *dest);
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*dest, mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}

static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+8), mm1);
	pavg_m2r (*dest, mm0);
	pavg_m2r (*(dest+8), mm1);
	movq_r2m (mm0, *dest);
	ref += stride;
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int offset,
			      const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*(ref+offset), mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}

static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int offset,
			       const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+8), mm1);
	pavg_m2r (*(ref+offset), mm0);
	pavg_m2r (*(ref+offset+8), mm1);
	movq_r2m (mm0, *dest);
	ref += stride;
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int offset,
			      const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*(ref+offset), mm0);
	pavg_m2r (*dest, mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}

static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int offset,
			       const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+8), mm1);
	pavg_m2r (*(ref+offset), mm0);
	pavg_m2r (*(ref+offset+8), mm1);
	pavg_m2r (*dest, mm0);
	pavg_m2r (*(dest+8), mm1);
	ref += stride;
	movq_r2m (mm0, *dest);
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
static mmx_t mask_one = {0x0101010101010101LL};
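/*
 * Added note (not in the original source): the MC_put4_* and MC_avg4_*
 * loops below approximate the 4-point average (x+y+z+w+2)>>2 by cascading
 * two rounded pavg steps, which can overshoot the exact value by 1. The
 * pxor/por/pand sequences derive a per-byte correction bit, set when an
 * inner average rounded up and the outer average rounds as well; mask_one
 * keeps only that low-order bit and psubusb subtracts it, restoring the
 * exactly rounded result.
 */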
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
	movq_m2r (*ref, mm2);
	movq_r2r (mm0, mm5);

	movq_m2r (*(ref+1), mm3);
	movq_r2r (mm2, mm6);

	pxor_r2r (mm3, mm6);
	pavg_r2r (mm3, mm2);

	por_r2r (mm6, mm7);
	pxor_r2r (mm2, mm5);

	pand_r2r (mm5, mm7);
	pavg_r2r (mm2, mm0);

	pand_m2r (mask_one, mm7);

	psubusb_r2r (mm7, mm0);

	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;

	movq_r2r (mm6, mm7);	/* unroll ! */
	movq_r2r (mm2, mm0);	/* unroll ! */
    } while (--height);
}
static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_r2m (mm0, *dest);

	movq_m2r (*(ref+8), mm0);
	movq_m2r (*(ref+stride+9), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+9), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride+8), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	ref += stride;
	movq_r2m (mm0, *(dest+8));
	dest += stride;
    } while (--height);
}
static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);
	movq_r2m (mm0, *dest);

	movq_m2r (*(ref+8), mm0);
	movq_m2r (*(ref+stride+9), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+9), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride+8), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*(dest+8), mm1);
	pavg_r2r (mm1, mm0);
	ref += stride;
	movq_r2m (mm0, *(dest+8));
	dest += stride;
    } while (--height);
}
static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
}


MPEG2_MC_EXTERN (mmxext)
static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
}


MPEG2_MC_EXTERN (3dnow)

#endif