misc updates
[mplayer/glamo.git] / libmpeg2 / motion_comp_mmx.c
blobb550f8c7c99f3103d14a58eb85bb2e777f5bc8a4
1 /*
2 * motion_comp_mmx.c
3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
7 * See http://libmpeg2.sourceforge.net/ for updates.
9 * mpeg2dec is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * mpeg2dec is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 #include "config.h"
26 #if ARCH_X86 || ARCH_X86_64
28 #include <inttypes.h>
30 #include "mpeg2.h"
31 #include "attributes.h"
32 #include "mpeg2_internal.h"
33 #include "mmx.h"
35 #define CPU_MMXEXT 0
36 #define CPU_3DNOW 1
39 /* MMX code - needs a rewrite */
/*
 * Motion Compensation frequently needs to average values using the
 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
 * to compute this, but it's been left out of classic MMX.
 *
 * We need to be careful of overflows when doing this computation.
 * Rather than unpacking data to 16-bits, which reduces parallelism,
 * we use the following formulas:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 */
54 /* some rounding constants */
55 static mmx_t mask1 = {0xfefefefefefefefeLL};
56 static mmx_t round4 = {0x0002000200020002LL};
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */
66 static inline void mmx_zero_reg (void)
68 /* load 0 into mm0 */
69 pxor_r2r (mm0, mm0);
72 static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
73 const uint8_t * src2)
75 /* *dest = (*src1 + *src2 + 1)/ 2; */
77 movq_m2r (*src1, mm1); /* load 8 src1 bytes */
78 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */
80 movq_m2r (*src2, mm3); /* load 8 src2 bytes */
81 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */
83 pxor_r2r (mm1, mm3); /* xor src1 and src2 */
84 pand_m2r (mask1, mm3); /* mask lower bits */
85 psrlq_i2r (1, mm3); /* /2 */
86 por_r2r (mm2, mm4); /* or src1 and src2 */
87 psubb_r2r (mm3, mm4); /* subtract subresults */
88 movq_r2m (mm4, *dest); /* store result in dest */
91 static inline void mmx_interp_average_2_U8 (uint8_t * dest,
92 const uint8_t * src1,
93 const uint8_t * src2)
95 /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */
97 movq_m2r (*dest, mm1); /* load 8 dest bytes */
98 movq_r2r (mm1, mm2); /* copy 8 dest bytes */
100 movq_m2r (*src1, mm3); /* load 8 src1 bytes */
101 movq_r2r (mm3, mm4); /* copy 8 src1 bytes */
103 movq_m2r (*src2, mm5); /* load 8 src2 bytes */
104 movq_r2r (mm5, mm6); /* copy 8 src2 bytes */
106 pxor_r2r (mm3, mm5); /* xor src1 and src2 */
107 pand_m2r (mask1, mm5); /* mask lower bits */
108 psrlq_i2r (1, mm5); /* /2 */
109 por_r2r (mm4, mm6); /* or src1 and src2 */
110 psubb_r2r (mm5, mm6); /* subtract subresults */
111 movq_r2r (mm6, mm5); /* copy subresult */
113 pxor_r2r (mm1, mm5); /* xor srcavg and dest */
114 pand_m2r (mask1, mm5); /* mask lower bits */
115 psrlq_i2r (1, mm5); /* /2 */
116 por_r2r (mm2, mm6); /* or srcavg and dest */
117 psubb_r2r (mm5, mm6); /* subtract subresults */
118 movq_r2m (mm6, *dest); /* store result in dest */
121 static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
122 const uint8_t * src2,
123 const uint8_t * src3,
124 const uint8_t * src4)
126 /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */
128 movq_m2r (*src1, mm1); /* load 8 src1 bytes */
129 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */
131 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */
132 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */
134 movq_m2r (*src2, mm3); /* load 8 src2 bytes */
135 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */
137 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */
138 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */
140 paddw_r2r (mm3, mm1); /* add lows */
141 paddw_r2r (mm4, mm2); /* add highs */
143 /* now have partials in mm1 and mm2 */
145 movq_m2r (*src3, mm3); /* load 8 src3 bytes */
146 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */
148 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */
149 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */
151 paddw_r2r (mm3, mm1); /* add lows */
152 paddw_r2r (mm4, mm2); /* add highs */
154 movq_m2r (*src4, mm5); /* load 8 src4 bytes */
155 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */
157 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */
158 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */
160 paddw_r2r (mm5, mm1); /* add lows */
161 paddw_r2r (mm6, mm2); /* add highs */
163 /* now have subtotal in mm1 and mm2 */
165 paddw_m2r (round4, mm1);
166 psraw_i2r (2, mm1); /* /4 */
167 paddw_m2r (round4, mm2);
168 psraw_i2r (2, mm2); /* /4 */
170 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */
171 movq_r2m (mm1, *dest); /* store result in dest */
174 static inline void mmx_interp_average_4_U8 (uint8_t * dest,
175 const uint8_t * src1,
176 const uint8_t * src2,
177 const uint8_t * src3,
178 const uint8_t * src4)
180 /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
182 movq_m2r (*src1, mm1); /* load 8 src1 bytes */
183 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */
185 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */
186 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */
188 movq_m2r (*src2, mm3); /* load 8 src2 bytes */
189 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */
191 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */
192 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */
194 paddw_r2r (mm3, mm1); /* add lows */
195 paddw_r2r (mm4, mm2); /* add highs */
197 /* now have partials in mm1 and mm2 */
199 movq_m2r (*src3, mm3); /* load 8 src3 bytes */
200 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */
202 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */
203 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */
205 paddw_r2r (mm3, mm1); /* add lows */
206 paddw_r2r (mm4, mm2); /* add highs */
208 movq_m2r (*src4, mm5); /* load 8 src4 bytes */
209 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */
211 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */
212 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */
214 paddw_r2r (mm5, mm1); /* add lows */
215 paddw_r2r (mm6, mm2); /* add highs */
217 paddw_m2r (round4, mm1);
218 psraw_i2r (2, mm1); /* /4 */
219 paddw_m2r (round4, mm2);
220 psraw_i2r (2, mm2); /* /4 */
222 /* now have subtotal/4 in mm1 and mm2 */
224 movq_m2r (*dest, mm3); /* load 8 dest bytes */
225 movq_r2r (mm3, mm4); /* copy 8 dest bytes */
227 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */
228 movq_r2r (mm1,mm2); /* copy subresult */
230 pxor_r2r (mm1, mm3); /* xor srcavg and dest */
231 pand_m2r (mask1, mm3); /* mask lower bits */
232 psrlq_i2r (1, mm3); /* /2 */
233 por_r2r (mm2, mm4); /* or srcavg and dest */
234 psubb_r2r (mm3, mm4); /* subtract subresults */
235 movq_r2m (mm4, *dest); /* store result in dest */
238 /*-----------------------------------------------------------------------*/
/*
 * Average `ref` into `dest` row by row: dest = (dest + ref + 1) / 2.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; the do/while runs at least once, so it must
 *          be >= 1.
 * stride - byte distance between rows, applied to both dest and ref.
 */
static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_avg_mmx with width 16. */
static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_avg_mmx with width 8. */
static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
268 /*-----------------------------------------------------------------------*/
270 static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
271 const uint8_t * ref, const int stride)
273 mmx_zero_reg ();
275 do {
276 movq_m2r (* ref, mm1); /* load 8 ref bytes */
277 movq_r2m (mm1,* dest); /* store 8 bytes at curr */
279 if (width == 16)
281 movq_m2r (* (ref+8), mm1); /* load 8 ref bytes */
282 movq_r2m (mm1,* (dest+8)); /* store 8 bytes at curr */
285 dest += stride;
286 ref += stride;
287 } while (--height);
/* 16-byte-wide entry point: forwards to MC_put_mmx with width 16. */
static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_put_mmx with width 8. */
static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
302 /*-----------------------------------------------------------------------*/
/* Half pixel interpolation in the x direction */
/*
 * For every row: dest = (dest + (ref[x] + ref[x+1] + 1) / 2 + 1) / 2.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to both dest and ref.
 * Reads one byte past the block width on each ref row (ref+1 / ref+9).
 */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_avg_x_mmx with width 16. */
static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_avg_x_mmx with width 8. */
static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
333 /*-----------------------------------------------------------------------*/
/*
 * Horizontal half-pel put: for every row,
 *   dest = (ref[x] + ref[x+1] + 1) / 2.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to both dest and ref.
 * Reads one byte past the block width on each ref row (ref+1 / ref+9).
 */
static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_put_x_mmx with width 16. */
static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_put_x_mmx with width 8. */
static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
363 /*-----------------------------------------------------------------------*/
/*
 * Half-pel interpolation in both directions, averaged into dest.
 * For every row the four neighbours ref[x], ref[x+1], ref[x+stride] and
 * ref[x+stride+1] are combined by mmx_interp_average_4_U8 with the old dest.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to dest, ref and ref_next.
 * Reads height+1 rows of ref and one byte past the block width per row.
 * mm0 is cleared first because the 4-way helper unpacks against it.
 */
static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_avg_xy_mmx with width 16. */
static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_avg_xy_mmx with width 8. */
static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
397 /*-----------------------------------------------------------------------*/
/*
 * Half-pel interpolation in both directions, written straight to dest.
 * For every row:
 *   dest = (ref[x] + ref[x+1] + ref[x+stride] + ref[x+stride+1] + 2) / 4.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to dest, ref and ref_next.
 * Reads height+1 rows of ref and one byte past the block width per row.
 * mm0 is cleared first because the 4-way helper unpacks against it.
 */
static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_put_xy_mmx with width 16. */
static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_put_xy_mmx with width 8. */
static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
430 /*-----------------------------------------------------------------------*/
/*
 * Vertical half-pel interpolation averaged into dest.  For every row:
 *   dest = (dest + (ref[x] + ref[x+stride] + 1) / 2 + 1) / 2.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to dest, ref and ref_next.
 * Reads height+1 rows of ref.
 */
static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_avg_y_mmx with width 16. */
static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_avg_y_mmx with width 8. */
static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
463 /*-----------------------------------------------------------------------*/
/*
 * Vertical half-pel put.  For every row:
 *   dest = (ref[x] + ref[x+stride] + 1) / 2.
 *
 * width  - 8 or 16; the second 8-byte column is processed only for 16.
 * height - number of rows; must be >= 1 (do/while).
 * stride - byte distance between rows, applied to dest, ref and ref_next.
 * Reads height+1 rows of ref.
 */
static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
/* 16-byte-wide entry point: forwards to MC_put_y_mmx with width 16. */
static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}
/* 8-byte-wide entry point: forwards to MC_put_y_mmx with width 8. */
static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
497 MPEG2_MC_EXTERN (mmx)
/* CPU_MMXEXT/CPU_3DNOW adaptation layer */

/*
 * Register-to-register byte average.  Picks pavgb when `cpu` equals
 * CPU_MMXEXT and pavgusb otherwise.  `cpu` is not a macro argument: it is
 * taken from the scope of the function that expands the macro.
 */
#define pavg_r2r(src,dest)              \
do {                                    \
    if (cpu == CPU_MMXEXT)              \
        pavgb_r2r (src, dest);          \
    else                                \
        pavgusb_r2r (src, dest);        \
} while (0)
/*
 * Memory-to-register byte average.  Picks pavgb when `cpu` equals
 * CPU_MMXEXT and pavgusb otherwise.  `cpu` is not a macro argument: it is
 * taken from the scope of the function that expands the macro.
 */
#define pavg_m2r(src,dest)              \
do {                                    \
    if (cpu == CPU_MMXEXT)              \
        pavgb_m2r (src, dest);          \
    else                                \
        pavgusb_m2r (src, dest);        \
} while (0)
524 /* CPU_MMXEXT code */
527 static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
528 const int stride)
530 do {
531 movq_m2r (*ref, mm0);
532 movq_r2m (mm0, *dest);
533 ref += stride;
534 dest += stride;
535 } while (--height);
538 static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
539 const int stride)
541 do {
542 movq_m2r (*ref, mm0);
543 movq_m2r (*(ref+8), mm1);
544 ref += stride;
545 movq_r2m (mm0, *dest);
546 movq_r2m (mm1, *(dest+8));
547 dest += stride;
548 } while (--height);
551 static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
552 const int stride, const int cpu)
554 do {
555 movq_m2r (*ref, mm0);
556 pavg_m2r (*dest, mm0);
557 ref += stride;
558 movq_r2m (mm0, *dest);
559 dest += stride;
560 } while (--height);
563 static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
564 const int stride, const int cpu)
566 do {
567 movq_m2r (*ref, mm0);
568 movq_m2r (*(ref+8), mm1);
569 pavg_m2r (*dest, mm0);
570 pavg_m2r (*(dest+8), mm1);
571 movq_r2m (mm0, *dest);
572 ref += stride;
573 movq_r2m (mm1, *(dest+8));
574 dest += stride;
575 } while (--height);
578 static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
579 const int stride, const int offset,
580 const int cpu)
582 do {
583 movq_m2r (*ref, mm0);
584 pavg_m2r (*(ref+offset), mm0);
585 ref += stride;
586 movq_r2m (mm0, *dest);
587 dest += stride;
588 } while (--height);
591 static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
592 const int stride, const int offset,
593 const int cpu)
595 do {
596 movq_m2r (*ref, mm0);
597 movq_m2r (*(ref+8), mm1);
598 pavg_m2r (*(ref+offset), mm0);
599 pavg_m2r (*(ref+offset+8), mm1);
600 movq_r2m (mm0, *dest);
601 ref += stride;
602 movq_r2m (mm1, *(dest+8));
603 dest += stride;
604 } while (--height);
607 static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
608 const int stride, const int offset,
609 const int cpu)
611 do {
612 movq_m2r (*ref, mm0);
613 pavg_m2r (*(ref+offset), mm0);
614 pavg_m2r (*dest, mm0);
615 ref += stride;
616 movq_r2m (mm0, *dest);
617 dest += stride;
618 } while (--height);
621 static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
622 const int stride, const int offset,
623 const int cpu)
625 do {
626 movq_m2r (*ref, mm0);
627 movq_m2r (*(ref+8), mm1);
628 pavg_m2r (*(ref+offset), mm0);
629 pavg_m2r (*(ref+offset+8), mm1);
630 pavg_m2r (*dest, mm0);
631 pavg_m2r (*(dest+8), mm1);
632 ref += stride;
633 movq_r2m (mm0, *dest);
634 movq_r2m (mm1, *(dest+8));
635 dest += stride;
636 } while (--height);
639 static mmx_t mask_one = {0x0101010101010101LL};
641 static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
642 const int stride, const int cpu)
644 movq_m2r (*ref, mm0);
645 movq_m2r (*(ref+1), mm1);
646 movq_r2r (mm0, mm7);
647 pxor_r2r (mm1, mm7);
648 pavg_r2r (mm1, mm0);
649 ref += stride;
651 do {
652 movq_m2r (*ref, mm2);
653 movq_r2r (mm0, mm5);
655 movq_m2r (*(ref+1), mm3);
656 movq_r2r (mm2, mm6);
658 pxor_r2r (mm3, mm6);
659 pavg_r2r (mm3, mm2);
661 por_r2r (mm6, mm7);
662 pxor_r2r (mm2, mm5);
664 pand_r2r (mm5, mm7);
665 pavg_r2r (mm2, mm0);
667 pand_m2r (mask_one, mm7);
669 psubusb_r2r (mm7, mm0);
671 ref += stride;
672 movq_r2m (mm0, *dest);
673 dest += stride;
675 movq_r2r (mm6, mm7); /* unroll ! */
676 movq_r2r (mm2, mm0); /* unroll ! */
677 } while (--height);
680 static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
681 const int stride, const int cpu)
683 do {
684 movq_m2r (*ref, mm0);
685 movq_m2r (*(ref+stride+1), mm1);
686 movq_r2r (mm0, mm7);
687 movq_m2r (*(ref+1), mm2);
688 pxor_r2r (mm1, mm7);
689 movq_m2r (*(ref+stride), mm3);
690 movq_r2r (mm2, mm6);
691 pxor_r2r (mm3, mm6);
692 pavg_r2r (mm1, mm0);
693 pavg_r2r (mm3, mm2);
694 por_r2r (mm6, mm7);
695 movq_r2r (mm0, mm6);
696 pxor_r2r (mm2, mm6);
697 pand_r2r (mm6, mm7);
698 pand_m2r (mask_one, mm7);
699 pavg_r2r (mm2, mm0);
700 psubusb_r2r (mm7, mm0);
701 movq_r2m (mm0, *dest);
703 movq_m2r (*(ref+8), mm0);
704 movq_m2r (*(ref+stride+9), mm1);
705 movq_r2r (mm0, mm7);
706 movq_m2r (*(ref+9), mm2);
707 pxor_r2r (mm1, mm7);
708 movq_m2r (*(ref+stride+8), mm3);
709 movq_r2r (mm2, mm6);
710 pxor_r2r (mm3, mm6);
711 pavg_r2r (mm1, mm0);
712 pavg_r2r (mm3, mm2);
713 por_r2r (mm6, mm7);
714 movq_r2r (mm0, mm6);
715 pxor_r2r (mm2, mm6);
716 pand_r2r (mm6, mm7);
717 pand_m2r (mask_one, mm7);
718 pavg_r2r (mm2, mm0);
719 psubusb_r2r (mm7, mm0);
720 ref += stride;
721 movq_r2m (mm0, *(dest+8));
722 dest += stride;
723 } while (--height);
726 static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
727 const int stride, const int cpu)
729 do {
730 movq_m2r (*ref, mm0);
731 movq_m2r (*(ref+stride+1), mm1);
732 movq_r2r (mm0, mm7);
733 movq_m2r (*(ref+1), mm2);
734 pxor_r2r (mm1, mm7);
735 movq_m2r (*(ref+stride), mm3);
736 movq_r2r (mm2, mm6);
737 pxor_r2r (mm3, mm6);
738 pavg_r2r (mm1, mm0);
739 pavg_r2r (mm3, mm2);
740 por_r2r (mm6, mm7);
741 movq_r2r (mm0, mm6);
742 pxor_r2r (mm2, mm6);
743 pand_r2r (mm6, mm7);
744 pand_m2r (mask_one, mm7);
745 pavg_r2r (mm2, mm0);
746 psubusb_r2r (mm7, mm0);
747 movq_m2r (*dest, mm1);
748 pavg_r2r (mm1, mm0);
749 ref += stride;
750 movq_r2m (mm0, *dest);
751 dest += stride;
752 } while (--height);
755 static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
756 const int stride, const int cpu)
758 do {
759 movq_m2r (*ref, mm0);
760 movq_m2r (*(ref+stride+1), mm1);
761 movq_r2r (mm0, mm7);
762 movq_m2r (*(ref+1), mm2);
763 pxor_r2r (mm1, mm7);
764 movq_m2r (*(ref+stride), mm3);
765 movq_r2r (mm2, mm6);
766 pxor_r2r (mm3, mm6);
767 pavg_r2r (mm1, mm0);
768 pavg_r2r (mm3, mm2);
769 por_r2r (mm6, mm7);
770 movq_r2r (mm0, mm6);
771 pxor_r2r (mm2, mm6);
772 pand_r2r (mm6, mm7);
773 pand_m2r (mask_one, mm7);
774 pavg_r2r (mm2, mm0);
775 psubusb_r2r (mm7, mm0);
776 movq_m2r (*dest, mm1);
777 pavg_r2r (mm1, mm0);
778 movq_r2m (mm0, *dest);
780 movq_m2r (*(ref+8), mm0);
781 movq_m2r (*(ref+stride+9), mm1);
782 movq_r2r (mm0, mm7);
783 movq_m2r (*(ref+9), mm2);
784 pxor_r2r (mm1, mm7);
785 movq_m2r (*(ref+stride+8), mm3);
786 movq_r2r (mm2, mm6);
787 pxor_r2r (mm3, mm6);
788 pavg_r2r (mm1, mm0);
789 pavg_r2r (mm3, mm2);
790 por_r2r (mm6, mm7);
791 movq_r2r (mm0, mm6);
792 pxor_r2r (mm2, mm6);
793 pand_r2r (mm6, mm7);
794 pand_m2r (mask_one, mm7);
795 pavg_r2r (mm2, mm0);
796 psubusb_r2r (mm7, mm0);
797 movq_m2r (*(dest+8), mm1);
798 pavg_r2r (mm1, mm0);
799 ref += stride;
800 movq_r2m (mm0, *(dest+8));
801 dest += stride;
802 } while (--height);
805 static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
806 int stride, int height)
808 MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
811 static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
812 int stride, int height)
814 MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
/* MMXEXT entry point: forwards to MC_put1_16 (plain 16-byte-wide copy). */
static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
/* MMXEXT entry point: forwards to MC_put1_8 (plain 8-byte-wide copy). */
static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
829 static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
830 int stride, int height)
832 MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
835 static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
836 int stride, int height)
838 MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
841 static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
842 int stride, int height)
844 MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
847 static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
848 int stride, int height)
850 MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
853 static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
854 int stride, int height)
856 MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
859 static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
860 int stride, int height)
862 MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
865 static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
866 int stride, int height)
868 MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
871 static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
872 int stride, int height)
874 MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
877 static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
878 int stride, int height)
880 MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
883 static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
884 int stride, int height)
886 MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
889 static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
890 int stride, int height)
892 MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
895 static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
896 int stride, int height)
898 MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
902 MPEG2_MC_EXTERN (mmxext)
906 static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
907 int stride, int height)
909 MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
912 static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
913 int stride, int height)
915 MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
/* 3DNow! entry point: forwards to MC_put1_16 (plain 16-byte-wide copy). */
static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
/* 3DNow! entry point: forwards to MC_put1_8 (plain 8-byte-wide copy). */
static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
930 static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
931 int stride, int height)
933 MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
936 static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
937 int stride, int height)
939 MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
942 static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
943 int stride, int height)
945 MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
948 static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
949 int stride, int height)
951 MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
954 static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
955 int stride, int height)
957 MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
960 static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
961 int stride, int height)
963 MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
966 static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
967 int stride, int height)
969 MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
972 static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
973 int stride, int height)
975 MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
978 static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
979 int stride, int height)
981 MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
984 static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
985 int stride, int height)
987 MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
990 static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
991 int stride, int height)
993 MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
996 static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
997 int stride, int height)
999 MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
1003 MPEG2_MC_EXTERN (3dnow)
1005 #endif