2 * x86 MMX and MMX2 packed byte operations in portable C.
3 * Extra instructions: pdiffub, pcmpzb, psumbw, pcmpgtub
4 * Author: Zoltan Hidvegi
/* Unsigned machine word; the operations in this file treat it as a vector of packed bytes. */
10 typedef unsigned long cmmx_t
;
/* All-ones divided by 255: the value 0x01 replicated in every byte of the word. */
12 #define ONE_BYTES (~(cmmx_t)0 / 255)
/* ONE_BYTES shifted to bit 7: the value 0x80 (top bit) in every byte of the word. */
13 #define SIGN_BITS (ONE_BYTES << 7)
/* All-ones divided by 257: 0x00ff in every 16-bit group, i.e. the low byte of each 16-bit lane. */
14 #define LOWBW_MASK (~(cmmx_t)0 / 257)
/*
 * paddb: per-byte addition of a and b, each byte wrapping modulo 256.
 *
 * The low 7 bits of every byte are added with the top bits masked off, so a
 * carry can reach at most bit 7 of its own byte and never spills into the
 * neighbouring byte.  XOR-ing with the top bits of (a ^ b) then produces the
 * correct bit 7 of each sum.
 */
17 paddb(cmmx_t a
, cmmx_t b
)
19 return ((a
& ~SIGN_BITS
) + (b
& ~SIGN_BITS
)) ^ ((a
^b
) & SIGN_BITS
);
/*
 * psubb: per-byte subtraction a - b, each byte wrapping modulo 256.
 *
 * Bit 7 of every byte of a is forced to 1 and bit 7 of every byte of b is
 * cleared before subtracting, so a borrow is always absorbed inside its own
 * byte.  Bit 7 of the raw difference is then the inverted borrow; XOR-ing
 * with the top bits of ~(a ^ b) turns it into the true top bit of the result.
 */
23 psubb(cmmx_t a
, cmmx_t b
)
25 return ((a
| SIGN_BITS
) - (b
& ~SIGN_BITS
)) ^ (~(a
^b
) & SIGN_BITS
);
/*
 * paddusb: per-byte unsigned addition that saturates at 0xff instead of
 * wrapping.
 */
29 paddusb(cmmx_t a
, cmmx_t b
)
/* s: sum of the low 7 bits of each byte; a carry out of bit 6 lands in bit 7. */
31 cmmx_t s
= (a
& ~SIGN_BITS
) + (b
& ~SIGN_BITS
);
/* abs: top bit set in every byte where at least one operand has its top bit set. */
32 cmmx_t abs
= (a
| b
) & SIGN_BITS
;
/*
 * c: top bit set in every byte whose 8-bit sum overflows, i.e. both top bits
 * are set, or one top bit is set and the 7-bit sum carried into bit 7.
 */
33 cmmx_t c
= abs
& (s
| (a
& b
));
/*
 * c is a subset of abs, so abs - (c >> 7) never borrows across bytes; it is
 * 0x7f in overflowing bytes, which together with abs fills the byte with 0xff.
 * Non-overflowing bytes keep s with the top bit from abs OR-ed in.
 */
34 return s
| abs
| (abs
- (c
>> 7));
/*
 * paddusb_s: variant of paddusb with the "_s" suffix.
 *
 * NOTE(review): `sum` is used below but its declaration is not part of this
 * excerpt; presumably it is derived from a and b on an omitted line - confirm.
 */
38 paddusb_s(cmmx_t a
, cmmx_t b
)
/* ov: top bit of each byte of sum. */
41 cmmx_t ov
= sum
& SIGN_BITS
;
/*
 * ov - (ov >> 7) is 0x7f in every byte whose top bit is set in sum and 0x00
 * elsewhere.
 * NOTE(review): where ov is zero this expression reduces to sum + sum, and
 * where ov is set the per-byte addition exceeds 0xff; confirm that this is
 * the intended result before relying on this routine.
 */
42 return sum
+ (sum
^ (ov
- (ov
>>7)));
/*
 * psubusb: per-byte unsigned subtraction a - b; judging by the name and by
 * its use in pmaxub it presumably clamps at 0 instead of wrapping - confirm.
 *
 * NOTE(review): `anb` is used below but its declaration is not part of this
 * excerpt; verify its definition against the full file.
 */
46 psubusb(cmmx_t a
, cmmx_t b
)
/*
 * s: raw difference with bit 7 of a forced on and bit 7 of b forced off, so a
 * borrow never leaves its byte (same first step as psubb).
 */
48 cmmx_t s
= (a
| SIGN_BITS
) - (b
& ~SIGN_BITS
);
/* c: per-byte flag in bit 7, built from anb and from s where the top bits of a and b agree. */
50 cmmx_t c
= (anb
| (s
& ~(a
^b
))) & SIGN_BITS
;
/*
 * c - (c >> 7) is 0x7f where c is set and 0x00 elsewhere, so bytes without
 * the flag are cleared and flagged bytes keep their low 7 bits of s; bit 7 is
 * kept only where both c and anb have it set.
 */
51 return s
& ((c
& anb
) | (c
- (c
>> 7)));
/*
 * psubusb_s: shorter form of the saturating byte subtraction a - b.
 *
 * The arithmetic below stays inside each byte only when every byte of a and b
 * has its top bit clear (values 0..0x7f); presumably callers guarantee that -
 * confirm.  Under that condition each result byte is a - b when a >= b and 0
 * otherwise.
 */
55 psubusb_s(cmmx_t a
, cmmx_t b
)
/* d: 0x80 + a - b per byte; bit 7 survives exactly when a >= b. */
57 cmmx_t d
= (a
|SIGN_BITS
) - b
;
/* m: bit 7 of d, i.e. the "no borrow" flag of each byte. */
58 cmmx_t m
= d
& SIGN_BITS
;
/* m - (m >> 7) is 0x7f where the flag is set and 0 elsewhere: keep the difference or zero the byte. */
59 return d
& (m
- (m
>>7));
/*
 * pcmpgtub: per-byte unsigned "greater than" comparison producing a byte mask.
 *
 * Note the parameter order: the FIRST parameter is named b and the second a.
 * Each result byte is 0xff where the byte of b (first argument) is greater
 * than the byte of a (second argument), and 0x00 otherwise.
 */
63 pcmpgtub(cmmx_t b
, cmmx_t a
)
/* s: borrow-contained difference of the low 7 bits; bit 7 stays set when a's low bits >= b's. */
65 cmmx_t s
= (a
| SIGN_BITS
) - (b
& ~SIGN_BITS
);
/*
 * b > a when b has the top bit and a does not, or when the top bits agree and
 * the low-7-bit subtraction borrowed (bit 7 of s cleared).
 */
66 cmmx_t ret
= ((~a
& b
) | (~s
& ~(a
^ b
))) & SIGN_BITS
;
/* Spread each 0x80 flag over its whole byte: 0x80 | (0x80 - 1) == 0xff. */
67 return ret
| (ret
- (ret
>> 7));
/*
 * pdiffub: per-byte unsigned difference of a and b (listed in the file header
 * as an extra instruction).  The visible steps compute the wrapped difference
 * a - b and then conditionally negate it in bytes where b is larger.
 *
 * NOTE(review): `gt7` is used below but its declaration is not part of this
 * excerpt; presumably it is gt >> 7 (a 0/1 flag per byte) - confirm.
 */
71 pdiffub(cmmx_t a
, cmmx_t b
)
/* xs: top bit set in every byte where a and b have equal top bits. */
73 cmmx_t xs
= (~a
^ b
) & SIGN_BITS
;
/* s: wrapped per-byte difference a - b (the same expression psubb returns). */
74 cmmx_t s
= ((a
| SIGN_BITS
) - (b
& ~SIGN_BITS
)) ^ xs
;
/*
 * gt: top bit set where b > a - either only b has the top bit, or the top
 * bits agree and the wrapped difference came out with its top bit set.
 */
75 cmmx_t gt
= ((~a
& b
) | (s
& xs
)) & SIGN_BITS
;
/* Where gt is set, flip the bits of s and add gt7; other bytes pass s through unchanged. */
77 return (s
^ gt
^ (gt
- gt7
)) + gt7
;
/*
 * pdiffub_s: shorter form of the per-byte absolute difference |a - b|.
 *
 * The arithmetic stays inside each byte only when every byte of a and b has
 * its top bit clear (values 0..0x7f); presumably callers guarantee that -
 * confirm.
 */
81 pdiffub_s(cmmx_t a
, cmmx_t b
)
/* d: 0x80 + a - b per byte; bit 7 survives exactly when a >= b. */
83 cmmx_t d
= (a
|SIGN_BITS
) - b
;
/* g: 1 in the low bit of every byte where a < b (bit 7 of d was cleared by the borrow). */
84 cmmx_t g
= (~d
& SIGN_BITS
) >> 7;
/*
 * SIGN_BITS - g is 0x80 where a >= b (the XOR just strips the bias) and 0x7f
 * where a < b (the XOR yields b - a - 1, and adding g completes the negation).
 */
85 return (d
^ (SIGN_BITS
-g
)) + g
;
/*
 * pmaxub: per-byte unsigned maximum of a and b.
 *
 * Adds b back onto psubusb(a, b).  Assuming psubusb yields a - b clamped at 0
 * per byte (confirm against its definition), each byte becomes a when a > b
 * and b otherwise, and the addition cannot carry between bytes.
 */
89 pmaxub(cmmx_t a
, cmmx_t b
)
91 return psubusb(a
,b
) + b
;
/*
 * pminub: per-byte unsigned minimum of a and b.
 *
 * paddusb(a, ~b) saturates to 0xff exactly in the bytes where a >= b;
 * subtracting ~b again leaves b in those bytes and a in the others.  Every
 * byte of the saturated sum is at least the matching byte of ~b, so the
 * subtraction never borrows across bytes.
 */
95 pminub(cmmx_t a
, cmmx_t b
)
97 return paddusb(a
,~b
) - ~b
;
/*
 * pminub_s: shorter form of the per-byte minimum.
 *
 * The arithmetic stays inside each byte only when every byte of a and b has
 * its top bit clear (values 0..0x7f); presumably callers guarantee that -
 * confirm.  Result bytes always have the top bit clear.
 */
101 pminub_s(cmmx_t a
, cmmx_t b
)
/* d: 0x80 + a - b per byte; bit 7 survives exactly when a >= b. */
103 cmmx_t d
= (a
|SIGN_BITS
) - b
;
/* m: 0x80 in bytes where a >= b (0x7f + 1), 0x7f in bytes where a < b. */
104 cmmx_t m
= ~SIGN_BITS
+ ((d
&SIGN_BITS
)>>7);
/*
 * Where a >= b, d & m is just the 0x80 bias, so adding b and masking gives b.
 * Where a < b, d & m keeps 0x80 + a - b, so adding b and masking gives a.
 */
105 return ((d
&m
) + b
) & ~SIGN_BITS
;
/*
 * pavgb: per-byte average of a and b rounded up, i.e. (a + b + 1) >> 1 for
 * each byte, computed without letting bits cross byte boundaries.
 */
109 pavgb(cmmx_t a
, cmmx_t b
)
/* ao / bo: the lowest bit of every byte of a and b. */
111 cmmx_t ao
= a
& ONE_BYTES
;
112 cmmx_t bo
= b
& ONE_BYTES
;
/*
 * Clearing the low bit before the shift keeps it from leaking into the byte
 * below; the two halves are then summed and 1 is added back wherever either
 * operand was odd, which gives the rounded-up average.
 */
113 return ((a
^ao
)>>1) + ((b
^bo
)>>1) + (ao
|bo
);
/*
 * pavgb_s: shorter form of the rounded-up per-byte average.
 *
 * Adds a, b and 1 in every byte directly, so it is only carry-free when every
 * byte of a and b has its top bit clear (values 0..0x7f); presumably callers
 * guarantee that - confirm.  The final mask removes the bit that the shift
 * pulls in from the neighbouring byte, so result bytes have the top bit clear.
 */
117 pavgb_s(cmmx_t a
, cmmx_t b
)
119 return ((a
+b
+ONE_BYTES
)>>1) & ~SIGN_BITS
;
/*
 * p31avgb: per-byte weighted average (3*a + b + 2) >> 2, i.e. a counts three
 * times as much as b and the result is rounded to nearest.
 */
123 p31avgb(cmmx_t a
, cmmx_t b
)
/* ao / bo: the two lowest bits of every byte of a and b. */
125 cmmx_t ao
= a
& (3*ONE_BYTES
);
126 cmmx_t bo
= b
& (3*ONE_BYTES
);
/*
 * High parts: with the two low bits cleared, the shift by 2 cannot leak into
 * the neighbouring byte, and 3 * (a >> 2) + (b >> 2) fits in a byte.
 */
127 return 3*((a
^ao
)>>2) + ((b
^bo
)>>2) +
/*
 * Low parts: 3*ao + bo + 2 is at most 14 per byte, so it cannot carry; after
 * the shift the mask discards the bits pulled in from the byte above.
 */
128 (((3*ao
+bo
+2*ONE_BYTES
)>>2) & (3*ONE_BYTES
));
/*
 * p31avgb_s: shorter form of the 3:1 weighted average, built from two plain
 * averages: first the rounded-down average of a and b, then pavgb_s of that
 * intermediate value with a.
 *
 * Like pavgb_s it adds whole bytes directly, so it presumably expects every
 * byte of a and b to have its top bit clear - confirm.
 */
132 p31avgb_s(cmmx_t a
, cmmx_t b
)
/* avg: (a + b) >> 1 per byte, with the bit shifted in from the byte above masked off. */
134 cmmx_t avg
= ((a
+b
)>>1) & ~SIGN_BITS
;
135 return pavgb_s(avg
, a
);
/*
 * Horizontal sum of the bytes of a word, returned as unsigned long.
 *
 * NOTE(review): the line carrying the function name and parameter list, the
 * declaration that receives the expression on line 143, and everything after
 * the final `if` are not part of this excerpt.  Presumably this is psumbw(a),
 * the routine psadbw calls - confirm against the full file.
 */
138 static inline unsigned long
/* Add each odd-positioned byte to its even neighbour: one partial sum per 16-bit lane. */
141 cmmx_t t
= (a
& LOWBW_MASK
) + ((a
>>8) & LOWBW_MASK
);
/* Fold the upper half of the word onto the lower half (shift by half the word width in bits). */
143 (unsigned long)t
+ (unsigned long)(t
>> (4*sizeof(cmmx_t
)));
/* Words wider than 4 bytes need a further folding step; its body is not shown here. */
144 if (sizeof(cmmx_t
) > 4)
/*
 * Shorter form of the horizontal byte sum, returned as unsigned long.
 *
 * NOTE(review): the line carrying the function name and parameter list, the
 * declaration of `ret`, and everything after line 156 are not part of this
 * excerpt.  Presumably this is psumbw_s(a), the routine psadbw_s calls -
 * confirm against the full file.
 *
 * Unlike the lane-widening variant, the first fold adds bytes to bytes
 * directly, so it presumably relies on the per-byte sums fitting in 8 bits -
 * confirm.
 */
149 static inline unsigned long
/* Fold the upper half of the word onto the lower half. */
153 (unsigned long)a
+ (unsigned long)(a
>> (4*sizeof(cmmx_t
)));
/* Words of at most 4 bytes: only two byte sums remain, add them and finish. */
154 if (sizeof(cmmx_t
) <= 4)
155 return (ret
& 0xff) + ((ret
>>8) & 0xff);
/* Wider words: merge adjacent bytes into 16-bit lanes; the remaining steps are not shown here. */
156 ret
= (ret
& 0xff00ff) + ((ret
>>8) & 0xff00ff);
/*
 * psadbw: passes the per-byte difference pdiffub(a, b) to psumbw and returns
 * the resulting scalar as unsigned long.
 */
161 static inline unsigned long
162 psadbw(cmmx_t a
, cmmx_t b
)
164 return psumbw(pdiffub(a
,b
));
/*
 * psadbw_s: same composition as psadbw but built from the "_s" helpers - it
 * passes pdiffub_s(a, b) to psumbw_s and returns the resulting scalar.
 * Any input restrictions of those two helpers apply here as well.
 */
167 static inline unsigned long
168 psadbw_s(cmmx_t a
, cmmx_t b
)
170 return psumbw_s(pdiffub_s(a
,b
));
/*
 * Per-byte "is zero" test producing a byte mask: 0xff in every byte of `a`
 * that is zero, 0x00 in every non-zero byte.
 *
 * NOTE(review): the signature line is not part of this excerpt; presumably
 * this is the body of pcmpzb(cmmx_t a), which pcmpeqb calls - confirm.
 */
/*
 * (a | SIGN_BITS) - ONE_BYTES never borrows across bytes; its bit 7 survives
 * only if some low bit of the byte was set.  OR-ing a back in adds the byte's
 * own top bit, so bit 7 of ret is set exactly for non-zero bytes.
 */
176 cmmx_t ret
= (((a
| SIGN_BITS
) - ONE_BYTES
) | a
) & SIGN_BITS
;
/* Spread each flag over its byte (0x80 | 0x7f == 0xff), then invert to mark the zero bytes. */
177 return ~(ret
| (ret
- (ret
>> 7)));
/*
 * pcmpeqb: per-byte equality comparison.  Bytes that are equal in a and b
 * become zero in a ^ b, so the result is whatever pcmpzb reports for the
 * XOR of the two operands.
 */
181 pcmpeqb(cmmx_t a
, cmmx_t b
)
183 return pcmpzb(a
^ b
);