1 /*****************************************************************************
2 * transforms_yuvmmx.h: MMX YUV transformation assembly
3 *****************************************************************************
4 * Copyright (C) 1999-2007 the VideoLAN team
5 * $Id$
7 * Authors: Olie Lho <ollie@sis.com.tw>
8 * Gaël Hendryckx <jimmy@via.ecp.fr>
9 * Samuel Hocevar <sam@zoy.org>
10 * Damien Fouilleul <damienf@videolan.org>
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25 *****************************************************************************/
27 #ifdef MODULE_NAME_IS_i420_rgb_mmx
29 /* hope these constant values are cache line aligned */
30 static const uint64_t mmx_80w = 0x0080008000800080ULL; /* Will be referenced as %4 in inline asm */
31 static const uint64_t mmx_10w = 0x1010101010101010ULL; /* -- as %5 */
32 static const uint64_t mmx_00ffw = 0x00ff00ff00ff00ffULL; /* -- as %6 */
33 static const uint64_t mmx_Y_coeff = 0x253f253f253f253fULL; /* -- as %7 */
35 static const uint64_t mmx_U_green = 0xf37df37df37df37dULL; /* -- as %8 */
36 static const uint64_t mmx_U_blue = 0x4093409340934093ULL; /* -- as %9 */
37 static const uint64_t mmx_V_red = 0x3312331233123312ULL; /* -- as %10 */
38 static const uint64_t mmx_V_green = 0xe5fce5fce5fce5fcULL; /* -- as %11 */
40 static const uint64_t mmx_mask_f8 = 0xf8f8f8f8f8f8f8f8ULL; /* -- as %12 */
41 static const uint64_t mmx_mask_fc = 0xfcfcfcfcfcfcfcfcULL; /* -- as %13 */
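/* Note on the constants above: they appear to be the usual ITU-R BT.601
 * YCbCr -> RGB factors in Q13 fixed point.  The 8-bit inputs are promoted
 * with "psllw $3" (x8) and pmulhw keeps the high 16 bits of the product
 * (/65536), so each constant effectively scales by coeff / 8192:
 *
 *   mmx_Y_coeff  0x253f =  9535   ->  9535 / 8192 ~  1.164  (255/219)
 *   mmx_U_blue   0x4093 = 16531   -> 16531 / 8192 ~  2.018
 *   mmx_U_green  0xf37d = -3203   -> -3203 / 8192 ~ -0.391
 *   mmx_V_red    0x3312 = 13074   -> 13074 / 8192 ~  1.596
 *   mmx_V_green  0xe5fc = -6660   -> -6660 / 8192 ~ -0.813
 */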
43 #if defined(CAN_COMPILE_MMX)
45 /* MMX assembly */
47 #define MMX_CALL(MMX_INSTRUCTIONS) \
48 do { \
49 __asm__ __volatile__( \
50 ".p2align 3 \n\t" \
51 MMX_INSTRUCTIONS \
52 : \
53 : "r" (p_y), "r" (p_u), \
54 "r" (p_v), "r" (p_buffer), \
55 "m" (mmx_80w), "m" (mmx_10w), \
56 "m" (mmx_00ffw), "m" (mmx_Y_coeff), \
57 "m" (mmx_U_green), "m" (mmx_U_blue), \
58 "m" (mmx_V_red), "m" (mmx_V_green), \
59 "m" (mmx_mask_f8), "m" (mmx_mask_fc) ); \
60 } while(0)
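/* Illustrative only (not part of the build): the conversion loops that
 * include this header are expected to have uint8_t *p_y, *p_u, *p_v and an
 * output pointer p_buffer in scope, and to chain the string macros below in
 * a single MMX_CALL per batch of 8 pixels, e.g. for RGB565 output:
 *
 *     MMX_CALL( MMX_INIT_16
 *               MMX_YUV_MUL
 *               MMX_YUV_ADD
 *               MMX_UNPACK_16 );
 *     p_y += 8; p_u += 4; p_v += 4; p_buffer += 8;
 *
 * with MMX_END issued once after the last batch. */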
62 #define MMX_END __asm__ __volatile__ ( "emms" )
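/* "emms" resets the x87 tag word; it is needed after MMX code because the
 * MMX registers alias the x87 floating point register stack. */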
64 #define MMX_INIT_16 " \n\
65 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
66 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
67 pxor %%mm4, %%mm4 # zero mm4 \n\
68 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
71 #define MMX_INIT_16_GRAY " \n\
72 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
73 #movl $0, (%3) # cache preload for image \n\
76 #define MMX_INIT_32 " \n\
77 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
78 movl $0, (%3) # cache preload for image \n\
79 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
80 pxor %%mm4, %%mm4 # zero mm4 \n\
81 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
85 * Do the multiply part of the conversion for even and odd pixels,
86 * register usage:
87 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
88 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
89 * mm6 -> Y even, mm7 -> Y odd
92 #define MMX_YUV_MUL " \n\
93 # convert the chroma part \n\
94 punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
95 punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
96 psubsw %4, %%mm0 # Cb -= 128 \n\
97 psubsw %4, %%mm1 # Cr -= 128 \n\
98 psllw $3, %%mm0 # Promote precision \n\
99 psllw $3, %%mm1 # Promote precision \n\
100 movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
101 movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
102 pmulhw %8, %%mm2 # Mul Cb with green coeff -> Cb green \n\
103 pmulhw %11, %%mm3 # Mul Cr with green coeff -> Cr green \n\
104 pmulhw %9, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
105 pmulhw %10, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
106 paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen \n\
108 # convert the luma part \n\
109 psubusb %5, %%mm6 # Y -= 16 \n\
110 movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
111 pand %6, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
112 psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
113 psllw $3, %%mm6 # Promote precision \n\
114 psllw $3, %%mm7 # Promote precision \n\
115 pmulhw %7, %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\
116 pmulhw %7, %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
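/* For reference, the scalar arithmetic this stage approximates per pixel,
 * using the Q13 constants defined at the top of the file (illustration only):
 *
 *     int cb = u - 128,  cr = v - 128;
 *     int c_blue  = ( 16531 * cb) >> 13;               // mmx_U_blue
 *     int c_red   = ( 13074 * cr) >> 13;               // mmx_V_red
 *     int c_green = (-3203 * cb - 6660 * cr) >> 13;    // mmx_U_green/V_green
 *     int y_      = ( 9535 * (y - 16)) >> 13;          // mmx_Y_coeff
 *
 * MMX_YUV_ADD below then forms y_ + c_blue, y_ + c_red, y_ + c_green and
 * clamps each to 0..255 (psubusb already clamps y - 16 at zero). */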
120 * Do the addition part of the conversion for even and odd pixels,
121 * register usage:
122 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
123 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
124 * mm6 -> Y even, mm7 -> Y odd
127 #define MMX_YUV_ADD " \n\
128 # Do horizontal and vertical scaling \n\
129 movq %%mm0, %%mm3 # Copy Cblue \n\
130 movq %%mm1, %%mm4 # Copy Cred \n\
131 movq %%mm2, %%mm5 # Copy Cgreen \n\
132 paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
133 paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
134 paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
135 paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
136 paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
137 paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
139 # Limit RGB even to 0..255 \n\
140 packuswb %%mm0, %%mm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
141 packuswb %%mm1, %%mm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
142 packuswb %%mm2, %%mm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
144 # Limit RGB odd to 0..255 \n\
145 packuswb %%mm3, %%mm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
146 packuswb %%mm4, %%mm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
147 packuswb %%mm5, %%mm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
149 # Interleave RGB even and odd \n\
150 punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
151 punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
152 punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
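/* The even/odd split exists because the multiplies need 16-bit lanes: the 8 Y
 * bytes were expanded into two registers of 4 words each (even columns in
 * mm6, odd columns in mm7) and processed in parallel; the packuswb and
 * punpcklbw steps above saturate to 0..255 and interleave them back into
 * byte order B7..B0 / R7..R0 / G7..G0. */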
156 * Grayscale case, only use Y
159 #define MMX_YUV_GRAY " \n\
160 # convert the luma part \n\
161 psubusb %5, %%mm6 \n\
162 movq %%mm6, %%mm7 \n\
163 pand %6, %%mm6 \n\
164 psrlw $8, %%mm7 \n\
165 psllw $3, %%mm6 \n\
166 psllw $3, %%mm7 \n\
167 pmulhw %7, %%mm6 \n\
168 pmulhw %7, %%mm7 \n\
169 packuswb %%mm6, %%mm6 \n\
170 packuswb %%mm7, %%mm7 \n\
171 punpcklbw %%mm7, %%mm6 \n\
174 #define MMX_UNPACK_16_GRAY " \n\
175 movq %%mm6, %%mm5 \n\
176 pand %12, %%mm6 \n\
177 pand %13, %%mm5 \n\
178 movq %%mm6, %%mm7 \n\
179 psrlw $3, %%mm7 \n\
180 pxor %%mm3, %%mm3 \n\
181 movq %%mm7, %%mm2 \n\
182 movq %%mm5, %%mm0 \n\
183 punpcklbw %%mm3, %%mm5 \n\
184 punpcklbw %%mm6, %%mm7 \n\
185 psllw $3, %%mm5 \n\
186 por %%mm5, %%mm7 \n\
187 movq %%mm7, (%3) \n\
188 punpckhbw %%mm3, %%mm0 \n\
189 punpckhbw %%mm6, %%mm2 \n\
190 psllw $3, %%mm0 \n\
191 movq 8(%0), %%mm6 \n\
192 por %%mm0, %%mm2 \n\
193 movq %%mm2, 8(%3) \n\
198 * convert RGB plane to RGB 15 bits,
199 * mm0 -> B, mm1 -> R, mm2 -> G,
200 * mm4 -> GB, mm5 -> AR pixel 4-7,
201 * mm6 -> GB, mm7 -> AR pixel 0-3
204 #define MMX_UNPACK_15 " \n\
205 # mask unneeded bits off \n\
206 pand %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
207 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
208 pand %12, %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
209 pand %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
210 psrlw $1,%%mm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
211 pxor %%mm4, %%mm4 # zero mm4 \n\
212 movq %%mm0, %%mm5 # Copy B7-B0 \n\
213 movq %%mm2, %%mm7 # Copy G7-G0 \n\
215 # convert rgb24 plane to rgb15 pack for pixel 0-3 \n\
216 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3______ \n\
217 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
218 psllw $2,%%mm2 # ________ ____g7g6 g5g4g3__ ________ \n\
219 por %%mm2, %%mm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
220 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
221 movq %%mm0, (%3) # store pixel 0-3 \n\
223 # convert rgb24 plane to rgb15 pack for pixel 4-7 \n\
224 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3______ \n\
225 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
226 psllw $2,%%mm7 # ________ ____g7g6 g5g4g3__ ________ \n\
227 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
228 por %%mm7, %%mm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
229 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
230 movq %%mm5, 8(%3) # store pixel 4-7 \n\
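/* Resulting 16-bit layout per pixel (RGB555, top bit unused):
 *     pixel = ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3)
 * i.e. 0RRRRRGG GGGBBBBB, which is what the shift/mask comments above spell
 * out bit by bit. */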
234 * convert RGB plane to RGB 16 bits,
235 * mm0 -> B, mm1 -> R, mm2 -> G,
236 * mm4 -> GB, mm5 -> AR pixel 4-7,
237 * mm6 -> GB, mm7 -> AR pixel 0-3
240 #define MMX_UNPACK_16 " \n\
241 # mask unneeded bits off \n\
242 pand %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
243 pand %13, %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
244 pand %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
245 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
246 pxor %%mm4, %%mm4 # zero mm4 \n\
247 movq %%mm0, %%mm5 # Copy B7-B0 \n\
248 movq %%mm2, %%mm7 # Copy G7-G0 \n\
250 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
251 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3g2____ \n\
252 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
253 psllw $3,%%mm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
254 por %%mm2, %%mm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
255 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
256 movq %%mm0, (%3) # store pixel 0-3 \n\
258 # convert rgb24 plane to rgb16 pack for pixel 4-7 \n\
259 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3g2____ \n\
260 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
261 psllw $3,%%mm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
262 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
263 por %%mm7, %%mm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
264 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
265 movq %%mm5, 8(%3) # store pixel 4-7 \n\
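/* Resulting 16-bit layout per pixel (RGB565):
 *     pixel = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3)
 * i.e. RRRRRGGG GGGBBBBB. */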
269 * convert RGB plane to RGB packed format,
270 * mm0 -> B, mm1 -> R, mm2 -> G
273 #define MMX_UNPACK_32_ARGB " \n\
274 pxor %%mm3, %%mm3 # zero mm3 \n\
275 movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
276 punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
277 movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
278 punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\
279 movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
280 punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
281 movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\
282 punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
283 movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\
284 punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
285 punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\
286 movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
287 punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
288 movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\
289 punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
290 movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
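/* Memory byte order per pixel after this store is B, G, R, 0x00; read as a
 * little-endian uint32_t that is 0x00RRGGBB, i.e. "ARGB" with A = 0.  The
 * RGBA/BGRA/ABGR variants below differ only in which plane lands in which
 * byte position. */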
293 #define MMX_UNPACK_32_RGBA " \n\
294 pxor %%mm3, %%mm3 # zero mm3 \n\
295 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
296 punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
297 punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
298 movq %%mm3, %%mm5 # B3 00 B2 00 B1 00 B0 00 \n\
299 punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
300 movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
301 punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
302 movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
303 pxor %%mm6, %%mm6 # zero mm6 \n\
304 punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
305 punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
306 movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
307 punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
308 movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
309 punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
310 movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
313 #define MMX_UNPACK_32_BGRA " \n\
314 pxor %%mm3, %%mm3 # zero mm3 \n\
315 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
316 punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
317 punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\
318 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
319 punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
320 movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\
321 punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
322 movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\
323 pxor %%mm6, %%mm6 # zero mm6 \n\
324 punpckhbw %%mm0, %%mm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
325 punpckhbw %%mm1, %%mm6 # R7 00 R6 00 R5 00 R4 00 \n\
326 movq %%mm6, %%mm0 # R7 00 R6 00 R5 00 R4 00 \n\
327 punpcklwd %%mm2, %%mm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
328 movq %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\
329 punpckhwd %%mm2, %%mm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
330 movq %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\
333 #define MMX_UNPACK_32_ABGR " \n\
334 pxor %%mm3, %%mm3 # zero mm3 \n\
335 movq %%mm1, %%mm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
336 punpcklbw %%mm2, %%mm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
337 movq %%mm0, %%mm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
338 punpcklbw %%mm3, %%mm5 # 00 B3 00 B2 00 B1 00 B0 \n\
339 movq %%mm4, %%mm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
340 punpcklwd %%mm5, %%mm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
341 movq %%mm4, (%3) # Store ABGR1 ABGR0 \n\
342 punpckhwd %%mm5, %%mm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
343 movq %%mm6, 8(%3) # Store ABGR3 ABGR2 \n\
344 punpckhbw %%mm2, %%mm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
345 punpckhbw %%mm3, %%mm0 # 00 B7 00 B6 00 B5 00 B4 \n\
346 movq %%mm1, %%mm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
347 punpcklwd %%mm0, %%mm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
348 movq %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\
349 punpckhwd %%mm0, %%mm2 # 00 B7 G7 R7 00 B6 G6 R6 \n\
350 movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
353 #elif defined(HAVE_MMX_INTRINSICS)
355 /* MMX intrinsics */
357 #include <mmintrin.h>
359 #define MMX_CALL(MMX_INSTRUCTIONS) \
360 do { \
361 __m64 mm0, mm1, mm2, mm3, \
362 mm4, mm5, mm6, mm7; \
363 MMX_INSTRUCTIONS \
364 } while(0)
366 #define MMX_END _mm_empty()
368 #define MMX_INIT_16 \
369 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
370 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
371 mm4 = _mm_setzero_si64(); \
372 mm6 = (__m64)*(uint64_t *)p_y;
374 #define MMX_INIT_32 \
375 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
376 *(uint16_t *)p_buffer = 0; \
377 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
378 mm4 = _mm_setzero_si64(); \
379 mm6 = (__m64)*(uint64_t *)p_y;
381 #define MMX_YUV_MUL \
382 mm0 = _mm_unpacklo_pi8(mm0, mm4); \
383 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
384 mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
385 mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
386 mm0 = _mm_slli_pi16(mm0, 3); \
387 mm1 = _mm_slli_pi16(mm1, 3); \
388 mm2 = mm0; \
389 mm3 = mm1; \
390 mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
391 mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
392 mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
393 mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
394 mm2 = _mm_adds_pi16(mm2, mm3); \
396 mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
397 mm7 = mm6; \
398 mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
399 mm7 = _mm_srli_pi16(mm7, 8); \
400 mm6 = _mm_slli_pi16(mm6, 3); \
401 mm7 = _mm_slli_pi16(mm7, 3); \
402 mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
403 mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
405 #define MMX_YUV_ADD \
406 mm3 = mm0; \
407 mm4 = mm1; \
408 mm5 = mm2; \
409 mm0 = _mm_adds_pi16(mm0, mm6); \
410 mm3 = _mm_adds_pi16(mm3, mm7); \
411 mm1 = _mm_adds_pi16(mm1, mm6); \
412 mm4 = _mm_adds_pi16(mm4, mm7); \
413 mm2 = _mm_adds_pi16(mm2, mm6); \
414 mm5 = _mm_adds_pi16(mm5, mm7); \
416 mm0 = _mm_packs_pu16(mm0, mm0); \
417 mm1 = _mm_packs_pu16(mm1, mm1); \
418 mm2 = _mm_packs_pu16(mm2, mm2); \
420 mm3 = _mm_packs_pu16(mm3, mm3); \
421 mm4 = _mm_packs_pu16(mm4, mm4); \
422 mm5 = _mm_packs_pu16(mm5, mm5); \
424 mm0 = _mm_unpacklo_pi8(mm0, mm3); \
425 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
426 mm2 = _mm_unpacklo_pi8(mm2, mm5);
428 #define MMX_UNPACK_15 \
429 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
430 mm0 = _mm_srli_pi16(mm0, 3); \
431 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
432 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
433 mm1 = _mm_srli_pi16(mm1, 1); \
434 mm4 = _mm_setzero_si64(); \
435 mm5 = mm0; \
436 mm7 = mm2; \
438 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
439 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
440 mm2 = _mm_slli_pi16(mm2, 2); \
441 mm0 = _mm_or_si64(mm0, mm2); \
442 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
443 *(uint64_t *)p_buffer = (uint64_t)mm0; \
445 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
446 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
447 mm7 = _mm_slli_pi16(mm7, 2); \
448 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
449 mm5 = _mm_or_si64(mm5, mm7); \
450 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
451 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
453 #define MMX_UNPACK_16 \
454 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
455 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
456 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
457 mm0 = _mm_srli_pi16(mm0, 3); \
458 mm4 = _mm_setzero_si64(); \
459 mm5 = mm0; \
460 mm7 = mm2; \
462 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
463 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
464 mm2 = _mm_slli_pi16(mm2, 3); \
465 mm0 = _mm_or_si64(mm0, mm2); \
466 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
467 *(uint64_t *)p_buffer = (uint64_t)mm0; \
469 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
470 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
471 mm7 = _mm_slli_pi16(mm7, 3); \
472 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
473 mm5 = _mm_or_si64(mm5, mm7); \
474 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
475 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
477 #define MMX_UNPACK_32_ARGB \
478 mm3 = _mm_setzero_si64(); \
479 mm4 = mm0; \
480 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
481 mm5 = mm1; \
482 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
483 mm6 = mm4; \
484 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
485 *(uint64_t *)p_buffer = (uint64_t)mm4; \
486 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
487 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
488 mm0 = _mm_unpackhi_pi8(mm0, mm2); \
489 mm1 = _mm_unpackhi_pi8(mm1, mm3); \
490 mm5 = mm0; \
491 mm5 = _mm_unpacklo_pi16(mm5, mm1); \
492 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\
493 mm0 = _mm_unpackhi_pi16(mm0, mm1); \
494 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
496 #define MMX_UNPACK_32_RGBA \
497 mm3 = _mm_setzero_si64(); \
498 mm4 = mm2; \
499 mm4 = _mm_unpacklo_pi8(mm4, mm1); \
500 mm3 = _mm_unpacklo_pi8(mm3, mm0); \
501 mm5 = mm3; \
502 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
503 *(uint64_t *)p_buffer = (uint64_t)mm3; \
504 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
505 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
506 mm6 = _mm_setzero_si64(); \
507 mm2 = _mm_unpackhi_pi8(mm2, mm1); \
508 mm6 = _mm_unpackhi_pi8(mm6, mm0); \
509 mm0 = mm6; \
510 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
511 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
512 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
513 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
515 #define MMX_UNPACK_32_BGRA \
516 mm3 = _mm_setzero_si64(); \
517 mm4 = mm2; \
518 mm4 = _mm_unpacklo_pi8(mm4, mm0); \
519 mm3 = _mm_unpacklo_pi8(mm3, mm1); \
520 mm5 = mm3; \
521 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
522 *(uint64_t *)p_buffer = (uint64_t)mm3; \
523 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
524 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
525 mm6 = _mm_setzero_si64(); \
526 mm2 = _mm_unpackhi_pi8(mm2, mm0); \
527 mm6 = _mm_unpackhi_pi8(mm6, mm1); \
528 mm0 = mm6; \
529 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
530 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
531 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
532 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
534 #define MMX_UNPACK_32_ABGR \
535 mm3 = _mm_setzero_si64(); \
536 mm4 = mm1; \
537 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
538 mm5 = mm0; \
539 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
540 mm6 = mm4; \
541 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
542 *(uint64_t *)p_buffer = (uint64_t)mm4; \
543 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
544 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
545 mm1 = _mm_unpackhi_pi8(mm1, mm2); \
546 mm0 = _mm_unpackhi_pi8(mm0, mm3); \
547 mm2 = mm1; \
548 mm1 = _mm_unpacklo_pi16(mm1, mm0); \
549 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
550 mm2 = _mm_unpackhi_pi16(mm2, mm0); \
551 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
553 #endif
555 #elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
557 #if defined(CAN_COMPILE_SSE2)
559 /* SSE2 assembly */
561 #define SSE2_CALL(SSE2_INSTRUCTIONS) \
562 do { \
563 __asm__ __volatile__( \
564 ".p2align 3 \n\t" \
565 SSE2_INSTRUCTIONS \
567 : "r" (p_y), "r" (p_u), \
568 "r" (p_v), "r" (p_buffer) \
569 : "eax" ); \
570 } while(0)
572 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
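/* The sfence is there because the *_ALIGNED variants below store with
 * movntdq; non-temporal stores are weakly ordered, so they must be fenced
 * before anyone else reads the output buffer. */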
574 #define SSE2_INIT_16_ALIGNED " \n\
575 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
576 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
577 pxor %%xmm4, %%xmm4 # zero mm4 \n\
578 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
581 #define SSE2_INIT_16_UNALIGNED " \n\
582 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
583 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
584 pxor %%xmm4, %%xmm4 # zero mm4 \n\
585 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
586 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
589 #define SSE2_INIT_32_ALIGNED " \n\
590 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
591 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
592 pxor %%xmm4, %%xmm4 # zero mm4 \n\
593 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
596 #define SSE2_INIT_32_UNALIGNED " \n\
597 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
598 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
599 pxor %%xmm4, %%xmm4 # zero mm4 \n\
600 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
601 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
604 #define SSE2_YUV_MUL " \n\
605 # convert the chroma part \n\
606 punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
607 punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
608 movl $0x00800080, %%eax # \n\
609 movd %%eax, %%xmm5 # \n\
610 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\
611 psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\
612 psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\
613 psllw $3, %%xmm0 # Promote precision \n\
614 psllw $3, %%xmm1 # Promote precision \n\
615 movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
616 movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
617 movl $0xf37df37d, %%eax # \n\
618 movd %%eax, %%xmm5 # \n\
619 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\
620 pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\
621 movl $0xe5fce5fc, %%eax # \n\
622 movd %%eax, %%xmm5 # \n\
623 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\
624 pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\
625 movl $0x40934093, %%eax # \n\
626 movd %%eax, %%xmm5 # \n\
627 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\
628 pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
629 movl $0x33123312, %%eax # \n\
630 movd %%eax, %%xmm5 # \n\
631 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\
632 pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
633 paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\
635 # convert the luma part \n\
636 movl $0x10101010, %%eax # \n\
637 movd %%eax, %%xmm5 # \n\
638 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\
639 psubusb %%xmm5, %%xmm6 # Y -= 16 \n\
640 movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
641 movl $0x00ff00ff, %%eax # \n\
642 movd %%eax, %%xmm5 # \n\
643 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\
644 pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
645 psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
646 psllw $3, %%xmm6 # Promote precision \n\
647 psllw $3, %%xmm7 # Promote precision \n\
648 movl $0x253f253f, %%eax # \n\
649 movd %%eax, %%xmm5 # \n\
650 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\
651 pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\
652 pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
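/* Unlike the MMX version, the SSE2 asm does not take the coefficients as
 * memory operands: each constant is rebuilt in %xmm5 with movl/movd/pshufd
 * (broadcasting a 32-bit immediate), presumably to avoid requiring 16-byte
 * aligned static constants and to keep the asm operand list short. */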
655 #define SSE2_YUV_ADD " \n\
656 # Do horizontal and vertical scaling \n\
657 movdqa %%xmm0, %%xmm3 # Copy Cblue \n\
658 movdqa %%xmm1, %%xmm4 # Copy Cred \n\
659 movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\
660 paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
661 paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
662 paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
663 paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
664 paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
665 paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
667 # Limit RGB even to 0..255 \n\
668 packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
669 packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
670 packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
672 # Limit RGB odd to 0..255 \n\
673 packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
674 packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
675 packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
677 # Interleave RGB even and odd \n\
678 punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
679 punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
680 punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
683 #define SSE2_UNPACK_15_ALIGNED " \n\
684 # mask unneeded bits off \n\
685 movl $0xf8f8f8f8, %%eax # \n\
686 movd %%eax, %%xmm5 # \n\
687 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
688 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
689 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
690 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
691 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
692 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
693 pxor %%xmm4, %%xmm4 # zero mm4 \n\
694 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
695 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
697 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
698 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
699 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
700 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
701 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
702 movntdq %%xmm0, (%3) # store pixel 0-7 \n\
704 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
705 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
706 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
707 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
708 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
709 movntdq %%xmm5, 16(%3) # store pixel 8-15 \n\
712 #define SSE2_UNPACK_15_UNALIGNED " \n\
713 # mask unneeded bits off \n\
714 movl $0xf8f8f8f8, %%eax # \n\
715 movd %%eax, %%xmm5 # \n\
716 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
717 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
718 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
719 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
720 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
721 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
722 pxor %%xmm4, %%xmm4 # zero mm4 \n\
723 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
724 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
726 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
727 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
728 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
729 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
730 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
731 movdqu %%xmm0, (%3) # store pixel 0-7 \n\
733 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
734 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
735 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
736 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
737 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
738 movdqu %%xmm5, 16(%3) # store pixel 8-15 \n\
741 #define SSE2_UNPACK_16_ALIGNED " \n\
742 # mask unneeded bits off \n\
743 movl $0xf8f8f8f8, %%eax # \n\
744 movd %%eax, %%xmm5 # \n\
745 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
746 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
747 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
748 movl $0xfcfcfcfc, %%eax # \n\
749 movd %%eax, %%xmm5 # \n\
750 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to fcfc fcfc ... fcfc fcfc \n\
751 pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
752 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
753 pxor %%xmm4, %%xmm4 # zero mm4 \n\
754 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
755 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
757 # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
758 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
759 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
760 psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
761 por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
762 movntdq %%xmm0, (%3) # store pixel 0-7 \n\
764 # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
765 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
766 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
767 psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
768 por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
769 movntdq %%xmm5, 16(%3) # store pixel 8-15 \n\
772 #define SSE2_UNPACK_16_UNALIGNED " \n\
773 # mask unneeded bits off \n\
774 movl $0xf8f8f8f8, %%eax # \n\
775 movd %%eax, %%xmm5 # \n\
776 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
777 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
778 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
779 movl $0xfcfcfcfc, %%eax # \n\
780 movd %%eax, %%xmm5 # \n\
781 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to fcfc fcfc ... fcfc fcfc \n\
782 pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
783 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
784 pxor %%xmm4, %%xmm4 # zero mm4 \n\
785 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
786 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
788 # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
789 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
790 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
791 psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
792 por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
793 movdqu %%xmm0, (%3) # store pixel 0-7 \n\
795 # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
796 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
797 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
798 psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
799 por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
800 movdqu %%xmm5, 16(%3) # store pixel 8-15 \n\
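/* Illustrative only (not part of the build): a 16bpp conversion loop is
 * expected to pick the _ALIGNED variants when both the Y source and the RGB
 * destination are 16-byte aligned, and the _UNALIGNED ones otherwise,
 * handling 16 pixels per call:
 *
 *     if( 0 == (15 & ((intptr_t)p_y | (intptr_t)p_buffer)) )
 *         SSE2_CALL( SSE2_INIT_16_ALIGNED
 *                    SSE2_YUV_MUL
 *                    SSE2_YUV_ADD
 *                    SSE2_UNPACK_16_ALIGNED );
 *     else
 *         SSE2_CALL( SSE2_INIT_16_UNALIGNED
 *                    SSE2_YUV_MUL
 *                    SSE2_YUV_ADD
 *                    SSE2_UNPACK_16_UNALIGNED );
 *     p_y += 16; p_u += 8; p_v += 8; p_buffer += 16;
 *
 * with SSE2_END issued once after the loop. */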
803 #define SSE2_UNPACK_32_ARGB_ALIGNED " \n\
804 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
805 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
806 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
807 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
808 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
809 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
810 punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
811 movntdq %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
812 punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
813 movntdq %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
814 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
815 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
816 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
817 punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
818 movntdq %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
819 punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
820 movntdq %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
823 #define SSE2_UNPACK_32_ARGB_UNALIGNED " \n\
824 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
825 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
826 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
827 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
828 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
829 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
830 punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
831 movdqu %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
832 punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
833 movdqu %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
834 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
835 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
836 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
837 punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
838 movdqu %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
839 punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
840 movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
843 #define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
844 pxor %%xmm3, %%xmm3 # zero mm3 \n\
845 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
846 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
847 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
848 movdqa %%xmm3, %%xmm5 # B3 00 B2 00 B1 00 B0 00 \n\
849 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
850 movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
851 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
852 movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
853 pxor %%xmm6, %%xmm6 # zero mm6 \n\
854 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
855 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
856 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
857 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
858 movntdq %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
859 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
860 movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
863 #define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
864 pxor %%xmm3, %%xmm3 # zero mm3 \n\
865 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
866 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
867 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
868 movdqa %%xmm3, %%xmm5 # B3 00 B2 00 B1 00 B0 00 \n\
869 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
870 movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
871 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
872 movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
873 pxor %%xmm6, %%xmm6 # zero mm6 \n\
874 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
875 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
876 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
877 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
878 movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
879 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
880 movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
883 #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
884 pxor %%xmm3, %%xmm3 # zero mm3 \n\
885 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
886 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
887 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
888 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
889 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
890 movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
891 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
892 movntdq %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
893 pxor %%xmm6, %%xmm6 # zero mm6 \n\
894 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
895 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
896 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
897 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
898 movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
899 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
900 movntdq %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
903 #define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\
904 pxor %%xmm3, %%xmm3 # zero mm3 \n\
905 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
906 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
907 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
908 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
909 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
910 movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
911 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
912 movdqu %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
913 pxor %%xmm6, %%xmm6 # zero mm6 \n\
914 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
915 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
916 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
917 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
918 movdqu %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
919 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
920 movdqu %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
923 #define SSE2_UNPACK_32_ABGR_ALIGNED " \n\
924 pxor %%xmm3, %%xmm3 # zero mm3 \n\
925 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
926 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
927 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
928 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
929 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
930 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
931 movntdq %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
932 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
933 movntdq %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
934 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
935 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
936 movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
937 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
938 movntdq %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
939 punpckhwd %%xmm0, %%xmm2 # 00 B7 G7 R7 00 B6 G6 R6 \n\
940 movntdq %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
943 #define SSE2_UNPACK_32_ABGR_UNALIGNED " \n\
944 pxor %%xmm3, %%xmm3 # zero mm3 \n\
945 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
946 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
947 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
948 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
949 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
950 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
951 movdqu %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
952 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
953 movdqu %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
954 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
955 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
956 movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
957 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
958 movdqu %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
959 punpckhwd %%xmm0, %%xmm2 # 00 B7 G7 R7 00 B6 G6 R6 \n\
960 movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
963 #elif defined(HAVE_SSE2_INTRINSICS)
965 /* SSE2 intrinsics */
967 #include <emmintrin.h>
969 #define SSE2_CALL(SSE2_INSTRUCTIONS) \
970 do { \
971 __m128i xmm0, xmm1, xmm2, xmm3, \
972 xmm4, xmm5, xmm6, xmm7; \
973 SSE2_INSTRUCTIONS \
974 } while(0)
976 #define SSE2_END _mm_sfence()
978 #define SSE2_INIT_16_ALIGNED \
979 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
980 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
981 xmm4 = _mm_setzero_si128(); \
982 xmm6 = _mm_load_si128((__m128i *)p_y);
984 #define SSE2_INIT_16_UNALIGNED \
985 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
986 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
987 xmm4 = _mm_setzero_si128(); \
988 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
989 _mm_prefetch(p_buffer, _MM_HINT_NTA);
991 #define SSE2_INIT_32_ALIGNED \
992 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
993 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
994 xmm4 = _mm_setzero_si128(); \
995 xmm6 = _mm_load_si128((__m128i *)p_y);
997 #define SSE2_INIT_32_UNALIGNED \
998 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
999 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
1000 xmm4 = _mm_setzero_si128(); \
1001 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
1002 _mm_prefetch(p_buffer, _MM_HINT_NTA);
1004 #define SSE2_YUV_MUL \
1005 xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
1006 xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
1007 xmm5 = _mm_set1_epi32(0x00800080UL); \
1008 xmm0 = _mm_subs_epi16(xmm0, xmm5); \
1009 xmm1 = _mm_subs_epi16(xmm1, xmm5); \
1010 xmm0 = _mm_slli_epi16(xmm0, 3); \
1011 xmm1 = _mm_slli_epi16(xmm1, 3); \
1012 xmm2 = xmm0; \
1013 xmm3 = xmm1; \
1014 xmm5 = _mm_set1_epi32(0xf37df37dUL); \
1015 xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \
1016 xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \
1017 xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \
1018 xmm5 = _mm_set1_epi32(0x40934093UL); \
1019 xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \
1020 xmm5 = _mm_set1_epi32(0x33123312UL); \
1021 xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \
1022 xmm2 = _mm_adds_epi16(xmm2, xmm3); \
1024 xmm5 = _mm_set1_epi32(0x10101010UL); \
1025 xmm6 = _mm_subs_epu8(xmm6, xmm5); \
1026 xmm7 = xmm6; \
1027 xmm5 = _mm_set1_epi32(0x00ff00ffUL); \
1028 xmm6 = _mm_and_si128(xmm6, xmm5); \
1029 xmm7 = _mm_srli_epi16(xmm7, 8); \
1030 xmm6 = _mm_slli_epi16(xmm6, 3); \
1031 xmm7 = _mm_slli_epi16(xmm7, 3); \
1032 xmm5 = _mm_set1_epi32(0x253f253fUL); \
1033 xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \
1034 xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
1036 #define SSE2_YUV_ADD \
1037 xmm3 = xmm0; \
1038 xmm4 = xmm1; \
1039 xmm5 = xmm2; \
1040 xmm0 = _mm_adds_epi16(xmm0, xmm6); \
1041 xmm3 = _mm_adds_epi16(xmm3, xmm7); \
1042 xmm1 = _mm_adds_epi16(xmm1, xmm6); \
1043 xmm4 = _mm_adds_epi16(xmm4, xmm7); \
1044 xmm2 = _mm_adds_epi16(xmm2, xmm6); \
1045 xmm5 = _mm_adds_epi16(xmm5, xmm7); \
1047 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
1048 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
1049 xmm2 = _mm_packus_epi16(xmm2, xmm2); \
1051 xmm3 = _mm_packus_epi16(xmm3, xmm3); \
1052 xmm4 = _mm_packus_epi16(xmm4, xmm4); \
1053 xmm5 = _mm_packus_epi16(xmm5, xmm5); \
1055 xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
1056 xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
1057 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
1059 #define SSE2_UNPACK_15_ALIGNED \
1060 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
1061 xmm0 = _mm_and_si128(xmm0, xmm5); \
1062 xmm0 = _mm_srli_epi16(xmm0, 3); \
1063 xmm2 = _mm_and_si128(xmm2, xmm5); \
1064 xmm1 = _mm_and_si128(xmm1, xmm5); \
1065 xmm1 = _mm_srli_epi16(xmm1, 1); \
1066 xmm4 = _mm_setzero_si128(); \
1067 xmm5 = xmm0; \
1068 xmm7 = xmm2; \
1070 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
1071 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
1072 xmm2 = _mm_slli_epi16(xmm2, 2); \
1073 xmm0 = _mm_or_si128(xmm0, xmm2); \
1074 _mm_stream_si128((__m128i*)p_buffer, xmm0); \
1076 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
1077 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
1078 xmm7 = _mm_slli_epi16(xmm7, 2); \
1079 xmm5 = _mm_or_si128(xmm5, xmm7); \
1080 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
1082 #define SSE2_UNPACK_15_UNALIGNED \
1083 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
1084 xmm0 = _mm_and_si128(xmm0, xmm5); \
1085 xmm0 = _mm_srli_epi16(xmm0, 3); \
1086 xmm2 = _mm_and_si128(xmm2, xmm5); \
1087 xmm1 = _mm_and_si128(xmm1, xmm5); \
1088 xmm1 = _mm_srli_epi16(xmm1, 1); \
1089 xmm4 = _mm_setzero_si128(); \
1090 xmm5 = xmm0; \
1091 xmm7 = xmm2; \
1093 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
1094 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
1095 xmm2 = _mm_slli_epi16(xmm2, 2); \
1096 xmm0 = _mm_or_si128(xmm0, xmm2); \
1097 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \
1099 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
1100 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
1101 xmm7 = _mm_slli_epi16(xmm7, 2); \
1102 xmm5 = _mm_or_si128(xmm5, xmm7); \
1103 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
1105 #define SSE2_UNPACK_16_ALIGNED \
1106 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
1107 xmm0 = _mm_and_si128(xmm0, xmm5); \
1108 xmm1 = _mm_and_si128(xmm1, xmm5); \
1109 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
1110 xmm2 = _mm_and_si128(xmm2, xmm5); \
1111 xmm0 = _mm_srli_epi16(xmm0, 3); \
1112 xmm4 = _mm_setzero_si128(); \
1113 xmm5 = xmm0; \
1114 xmm7 = xmm2; \
1116 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
1117 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
1118 xmm2 = _mm_slli_epi16(xmm2, 3); \
1119 xmm0 = _mm_or_si128(xmm0, xmm2); \
1120 _mm_stream_si128((__m128i*)p_buffer, xmm0); \
1122 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
1123 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
1124 xmm7 = _mm_slli_epi16(xmm7, 3); \
1125 xmm5 = _mm_or_si128(xmm5, xmm7); \
1126 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
1128 #define SSE2_UNPACK_16_UNALIGNED \
1129 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
1130 xmm0 = _mm_and_si128(xmm0, xmm5); \
1131 xmm1 = _mm_and_si128(xmm1, xmm5); \
1132 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
1133 xmm2 = _mm_and_si128(xmm2, xmm5); \
1134 xmm0 = _mm_srli_epi16(xmm0, 3); \
1135 xmm4 = _mm_setzero_si128(); \
1136 xmm5 = xmm0; \
1137 xmm7 = xmm2; \
1139 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
1140 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
1141 xmm2 = _mm_slli_epi16(xmm2, 3); \
1142 xmm0 = _mm_or_si128(xmm0, xmm2); \
1143 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \
1145 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
1146 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
1147 xmm7 = _mm_slli_epi16(xmm7, 3); \
1148 xmm5 = _mm_or_si128(xmm5, xmm7); \
1149 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
1151 #define SSE2_UNPACK_32_ARGB_ALIGNED \
1152 xmm3 = _mm_setzero_si128(); \
1153 xmm4 = xmm0; \
1154 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
1155 xmm5 = xmm1; \
1156 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
1157 xmm6 = xmm4; \
1158 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
1159 _mm_stream_si128((__m128i*)(p_buffer), xmm4); \
1160 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
1161 _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
1162 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
1163 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
1164 xmm5 = xmm0; \
1165 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
1166 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \
1167 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
1168 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
1170 #define SSE2_UNPACK_32_ARGB_UNALIGNED \
1171 xmm3 = _mm_setzero_si128(); \
1172 xmm4 = xmm0; \
1173 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
1174 xmm5 = xmm1; \
1175 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
1176 xmm6 = xmm4; \
1177 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
1178 _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
1179 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
1180 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
1181 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
1182 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
1183 xmm5 = xmm0; \
1184 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
1185 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \
1186 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
1187 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
1189 #define SSE2_UNPACK_32_RGBA_ALIGNED \
1190 xmm3 = _mm_setzero_si128(); \
1191 xmm4 = xmm2; \
1192 xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
1193 xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \
1194 xmm5 = xmm3; \
1195 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
1196 _mm_stream_si128((__m128i*)(p_buffer), xmm3); \
1197 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
1198 _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
1199 xmm6 = _mm_setzero_si128(); \
1200 xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \
1201 xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \
1202 xmm0 = xmm6; \
1203 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
1204 _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
1205 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
1206 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
1208 #define SSE2_UNPACK_32_RGBA_UNALIGNED \
1209 xmm3 = _mm_setzero_si128(); \
1210 xmm4 = xmm2; \
1211 xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
1212 xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \
1213 xmm5 = xmm3; \
1214 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
1215 _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
1216 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
1217 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
1218 xmm6 = _mm_setzero_si128(); \
1219 xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \
1220 xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \
1221 xmm0 = xmm6; \
1222 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
1223 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
1224 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
1225 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
1227 #define SSE2_UNPACK_32_BGRA_ALIGNED \
1228 xmm3 = _mm_setzero_si128(); \
1229 xmm4 = xmm2; \
1230 xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
1231 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
1232 xmm5 = xmm3; \
1233 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
1234 _mm_stream_si128((__m128i*)(p_buffer), xmm3); \
1235 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
1236 _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
1237 xmm6 = _mm_setzero_si128(); \
1238 xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
1239 xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
1240 xmm0 = xmm6; \
1241 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
1242 _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
1243 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
1244 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
1246 #define SSE2_UNPACK_32_BGRA_UNALIGNED \
1247 xmm3 = _mm_setzero_si128(); \
1248 xmm4 = xmm2; \
1249 xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
1250 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
1251 xmm5 = xmm3; \
1252 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
1253 _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
1254 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
1255 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
1256 xmm6 = _mm_setzero_si128(); \
1257 xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
1258 xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
1259 xmm0 = xmm6; \
1260 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
1261 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
1262 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
1263 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
1265 #define SSE2_UNPACK_32_ABGR_ALIGNED \
1266 xmm3 = _mm_setzero_si128(); \
1267 xmm4 = xmm1; \
1268 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
1269 xmm5 = xmm0; \
1270 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
1271 xmm6 = xmm4; \
1272 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
1273 _mm_stream_si128((__m128i*)(p_buffer), xmm4); \
1274 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
1275 _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
1276 xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
1277 xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
1278 xmm2 = xmm1; \
1279 xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
1280 _mm_stream_si128((__m128i*)(p_buffer+8), xmm1); \
1281 xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
1282 _mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
1284 #define SSE2_UNPACK_32_ABGR_UNALIGNED \
1285 xmm3 = _mm_setzero_si128(); \
1286 xmm4 = xmm1; \
1287 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
1288 xmm5 = xmm0; \
1289 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
1290 xmm6 = xmm4; \
1291 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
1292 _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
1293 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
1294 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
1295 xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
1296 xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
1297 xmm2 = xmm1; \
1298 xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
1299 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); \
1300 xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
1301 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
1303 #endif
1305 #endif