/*
 * Web-page text captured together with this file (not part of the header):
 *   20% faster hqdn3d on x86_64
 *   [mplayer/glamo.git] / libmpcodecs / cmmx.h
 *   blob 6e0c8509f50e0914dd35aef8a75efef37e81aa21
 */
/*
 * x86 MMX and MMX2 packed byte operations in portable C.
 * Extra instructions: pdiffub, pcmpzb, psumbw, pcmpgtub
 * Author: Zoltan Hidvegi
 *
 * Every function treats a cmmx_t as sizeof(cmmx_t) independent unsigned
 * bytes and works on all of them at once with ordinary integer arithmetic.
 * The bit tricks below are arranged so that no carry or borrow ever crosses
 * a byte boundary.
 *
 * The functions with an "_s" suffix are shorter variants whose documented
 * results only hold when bit 7 of every input byte is clear (each byte is
 * <= 0x7f); their results also keep bit 7 clear.
 * NOTE(review): the meaning of the "_s" suffix is not stated anywhere in
 * this file; the 7-bit precondition is derived from the arithmetic.
 */

#ifndef MPLAYER_CMMX_H
#define MPLAYER_CMMX_H

/* One machine word holding sizeof(unsigned long) packed bytes. */
typedef unsigned long cmmx_t;

/* 0x01 in every byte. */
#define ONE_BYTES (~(cmmx_t)0 / 255)
/* 0x80 in every byte: the per-byte top ("sign") bit. */
#define SIGN_BITS (ONE_BYTES << 7)
/* 0x00ff in every 16-bit lane: selects the low byte of each byte pair. */
#define LOWBW_MASK (~(cmmx_t)0 / 257)

/*
 * Per-byte wrapping addition: (a + b) mod 256 in each byte.
 * The low 7 bits are added with bit 7 masked off so nothing carries into
 * the neighbouring byte; bit 7 is then patched in with XOR.
 */
static inline cmmx_t
paddb(cmmx_t a, cmmx_t b)
{
    return ((a & ~SIGN_BITS) + (b & ~SIGN_BITS)) ^ ((a ^ b) & SIGN_BITS);
}

/*
 * Per-byte wrapping subtraction: (a - b) mod 256 in each byte.
 * Bit 7 of the minuend is forced on so the 7-bit subtraction cannot borrow
 * from the neighbouring byte; the true bit 7 is restored with XOR.
 */
static inline cmmx_t
psubb(cmmx_t a, cmmx_t b)
{
    return ((a | SIGN_BITS) - (b & ~SIGN_BITS)) ^ (~(a ^ b) & SIGN_BITS);
}

/* Per-byte unsigned saturating addition: min(a + b, 255) in each byte. */
static inline cmmx_t
paddusb(cmmx_t a, cmmx_t b)
{
    cmmx_t s = (a & ~SIGN_BITS) + (b & ~SIGN_BITS); /* 7-bit sums, bit 7 = carry out of bit 6 */
    cmmx_t abs = (a | b) & SIGN_BITS;               /* bit 7 set in either operand */
    cmmx_t c = abs & (s | (a & b));                 /* bytes whose true sum exceeds 255 */

    /* (abs - (c >> 7)) is 0x7f in overflowing bytes, turning them into 0xff. */
    return s | abs | (abs - (c >> 7));
}

/*
 * Per-byte saturating addition for 7-bit bytes: min(a + b, 0x7f) in each
 * byte. Requires every byte of a and b to be <= 0x7f.
 *
 * The previous expression, sum + (sum ^ (ov - (ov >> 7))), returned
 * 2 * (a + b) whenever no byte overflowed and carried into the next byte
 * when one did.
 * NOTE(review): the 0x7f ceiling is chosen to match the other *_s helpers,
 * which all return bytes with bit 7 clear; confirm against callers.
 */
static inline cmmx_t
paddusb_s(cmmx_t a, cmmx_t b)
{
    cmmx_t sum = a + b;             /* each byte <= 0xfe, so no cross-byte carry */
    cmmx_t ov = sum & SIGN_BITS;    /* bytes whose sum exceeded 0x7f */

    /* Overflowed bytes become 0xff via the OR and are then clamped to 0x7f. */
    return (sum | (ov - (ov >> 7))) & ~SIGN_BITS;
}

/* Per-byte unsigned saturating subtraction: max(a - b, 0) in each byte. */
static inline cmmx_t
psubusb(cmmx_t a, cmmx_t b)
{
    cmmx_t s = (a | SIGN_BITS) - (b & ~SIGN_BITS);      /* borrow-free 7-bit difference */
    cmmx_t anb = a & ~b;
    cmmx_t c = (anb | (s & ~(a ^ b))) & SIGN_BITS;      /* bit 7 set where a >= b */

    /* Keep the difference where a >= b, zero the byte otherwise. */
    return s & ((c & anb) | (c - (c >> 7)));
}

/*
 * Per-byte saturating subtraction for 7-bit bytes: max(a - b, 0).
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline cmmx_t
psubusb_s(cmmx_t a, cmmx_t b)
{
    cmmx_t d = (a | SIGN_BITS) - b;     /* 0x80 + a - b per byte, never borrows */
    cmmx_t m = d & SIGN_BITS;           /* bit 7 survives exactly where a >= b */

    return d & (m - (m >> 7));
}

/*
 * Per-byte unsigned compare: 0xff in each byte where b > a, 0x00 elsewhere.
 * Note the parameter order: the FIRST argument is tested for being greater
 * than the second.
 */
static inline cmmx_t
pcmpgtub(cmmx_t b, cmmx_t a)
{
    cmmx_t s = (a | SIGN_BITS) - (b & ~SIGN_BITS);
    cmmx_t ret = ((~a & b) | (~s & ~(a ^ b))) & SIGN_BITS;

    /* Spread bit 7 over the whole byte: 0x80 -> 0xff. */
    return ret | (ret - (ret >> 7));
}

/* Per-byte unsigned absolute difference: |a - b| in each byte. */
static inline cmmx_t
pdiffub(cmmx_t a, cmmx_t b)
{
    cmmx_t xs = (~a ^ b) & SIGN_BITS;                           /* top bits of a and b agree */
    cmmx_t s = ((a | SIGN_BITS) - (b & ~SIGN_BITS)) ^ xs;       /* a - b mod 256 */
    cmmx_t gt = ((~a & b) | (s & xs)) & SIGN_BITS;              /* bit 7 set where b > a */
    cmmx_t gt7 = gt >> 7;

    /* Negate (invert, then add one) only the bytes where b > a. */
    return (s ^ gt ^ (gt - gt7)) + gt7;
}

/*
 * Per-byte absolute difference for 7-bit bytes: |a - b|.
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline cmmx_t
pdiffub_s(cmmx_t a, cmmx_t b)
{
    cmmx_t d = (a | SIGN_BITS) - b;             /* 0x80 + a - b per byte */
    cmmx_t g = (~d & SIGN_BITS) >> 7;           /* 1 in bytes where a < b */

    return (d ^ (SIGN_BITS - g)) + g;
}

/* Per-byte unsigned maximum: max(a - b, 0) + b. */
static inline cmmx_t
pmaxub(cmmx_t a, cmmx_t b)
{
    return psubusb(a, b) + b;
}

/* Per-byte unsigned minimum: min(a + (255 - b), 255) - (255 - b). */
static inline cmmx_t
pminub(cmmx_t a, cmmx_t b)
{
    return paddusb(a, ~b) - ~b;
}

/*
 * Per-byte minimum for 7-bit bytes.
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline cmmx_t
pminub_s(cmmx_t a, cmmx_t b)
{
    cmmx_t d = (a | SIGN_BITS) - b;                         /* 0x80 + a - b per byte */
    cmmx_t m = ~SIGN_BITS + ((d & SIGN_BITS) >> 7);         /* 0x80 where a >= b, else 0x7f */

    return ((d & m) + b) & ~SIGN_BITS;
}

/* Per-byte unsigned average, rounding up: (a + b + 1) >> 1 in each byte. */
static inline cmmx_t
pavgb(cmmx_t a, cmmx_t b)
{
    cmmx_t ao = a & ONE_BYTES;
    cmmx_t bo = b & ONE_BYTES;

    /* Low bits are cleared before shifting so they cannot leak into the
     * byte below; the rounding term is 1 if either low bit was set. */
    return ((a ^ ao) >> 1) + ((b ^ bo) >> 1) + (ao | bo);
}

/*
 * Per-byte average, rounding up, for 7-bit bytes: (a + b + 1) >> 1.
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline cmmx_t
pavgb_s(cmmx_t a, cmmx_t b)
{
    return ((a + b + ONE_BYTES) >> 1) & ~SIGN_BITS;
}

/* Per-byte 3:1 weighted average: (3*a + b + 2) >> 2 in each byte. */
static inline cmmx_t
p31avgb(cmmx_t a, cmmx_t b)
{
    cmmx_t ao = a & (3 * ONE_BYTES);
    cmmx_t bo = b & (3 * ONE_BYTES);

    /* High six bits and low two bits of each byte are handled separately
     * so the shifts never pull bits in from a neighbouring byte. */
    return 3 * ((a ^ ao) >> 2) + ((b ^ bo) >> 2) +
           (((3 * ao + bo + 2 * ONE_BYTES) >> 2) & (3 * ONE_BYTES));
}

/*
 * Approximate per-byte 3:1 weighted average for 7-bit bytes, built from two
 * chained averages: first floor((a + b) / 2), then pavgb_s() of that with a.
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline cmmx_t
p31avgb_s(cmmx_t a, cmmx_t b)
{
    cmmx_t avg = ((a + b) >> 1) & ~SIGN_BITS;

    return pavgb_s(avg, a);
}

/* Horizontal sum of all bytes of a. */
static inline unsigned long
psumbw(cmmx_t a)
{
    /* Add odd and even bytes into 16-bit lanes (each lane <= 510). */
    cmmx_t t = (a & LOWBW_MASK) + ((a >> 8) & LOWBW_MASK);
    /* Fold the upper half of the word onto the lower half. */
    unsigned long ret =
        (unsigned long)t + (unsigned long)(t >> (4 * sizeof(cmmx_t)));

    /* Words wider than 4 bytes still hold two partial sums at this point. */
    if (sizeof(cmmx_t) > 4) {
        ret += ret >> 16;
    }
    return ret & 0xffff;
}

/*
 * Horizontal sum of all bytes of a, for 7-bit bytes.
 * Requires every byte of a to be <= 0x7f, which lets the first fold be done
 * byte-wise without overflowing a byte.
 */
static inline unsigned long
psumbw_s(cmmx_t a)
{
    unsigned long ret =
        (unsigned long)a + (unsigned long)(a >> (4 * sizeof(cmmx_t)));

    if (sizeof(cmmx_t) <= 4) {
        return (ret & 0xff) + ((ret >> 8) & 0xff);
    }
    ret = (ret & 0xff00ff) + ((ret >> 8) & 0xff00ff);
    ret += ret >> 16;
    return ret & 0xffff;
}

/* Sum of per-byte absolute differences of a and b. */
static inline unsigned long
psadbw(cmmx_t a, cmmx_t b)
{
    return psumbw(pdiffub(a, b));
}

/*
 * Sum of per-byte absolute differences for 7-bit bytes.
 * Requires every byte of a and b to be <= 0x7f.
 */
static inline unsigned long
psadbw_s(cmmx_t a, cmmx_t b)
{
    return psumbw_s(pdiffub_s(a, b));
}

/* Per-byte compare with zero: 0xff in each byte of a that is 0, else 0x00. */
static inline cmmx_t
pcmpzb(cmmx_t a)
{
    /* Bit 7 ends up set in exactly the bytes that are non-zero. */
    cmmx_t ret = (((a | SIGN_BITS) - ONE_BYTES) | a) & SIGN_BITS;

    return ~(ret | (ret - (ret >> 7)));
}

/* Per-byte equality: 0xff in each byte where a and b match, else 0x00. */
static inline cmmx_t
pcmpeqb(cmmx_t a, cmmx_t b)
{
    return pcmpzb(a ^ b);
}

#endif /* MPLAYER_CMMX_H */