Fix for SIMD YUV444 to RGB conversion
[liboggplay.git] / src / liboggplay / x86 / yuv2rgb_x86_vs.h
#ifndef __OGGPLAY_YUV2RGB_VS_H__
#define __OGGPLAY_YUV2RGB_VS_H__

#define ATTR_ALIGN(_align) __declspec(align(_align))

#define emms() __asm emms
#define MMX_MOVNTQ movq
#define SSE_MOVNTQ movntq
#define SSE2_MOVNTQ movdqu
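
/*
 * LOAD_YUV / LOAD_YUV444 pull one block of source pixels into SIMD registers:
 * reg6 <- packed Y bytes from py, reg0 <- U bytes from pu, reg1 <- V bytes
 * from pv, and reg4 is zeroed for the byte-to-word unpacking that follows.
 * LOAD_YUV uses a half-width load for subsampled chroma planes, while
 * LOAD_YUV444 loads full-width chroma for 4:4:4 data.  On _M_IX86 the macros
 * expand to MSVC inline assembly; on _M_AMD64 (where MSVC has no inline
 * assembler) they expand to SSE2 intrinsics and the mov/register-type
 * parameters are ignored.
 */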
#if defined(_M_IX86)
#define LOAD_YUV(mov_instr, mov_half, reg_type) \
  __asm { \
    __asm mov eax, py \
    __asm mov edx, pu \
    __asm mov_instr reg_type##6, [eax] \
    __asm mov_half reg_type##0, [edx] \
    __asm mov eax, pv \
    __asm mov_half reg_type##1, [eax] \
    __asm pxor reg_type##4, reg_type##4 \
  }

#define LOAD_YUV444(mov_instr, reg_type) \
  __asm { \
    __asm mov eax, py \
    __asm mov edx, pu \
    __asm mov_instr reg_type##6, [eax] \
    __asm mov_instr reg_type##0, [edx] \
    __asm mov eax, pv \
    __asm mov_instr reg_type##1, [eax] \
    __asm pxor reg_type##4, reg_type##4 \
  }

#elif defined(_M_AMD64)
#define LOAD_YUV(mov_instr, mov_half, reg_type) \
  { \
    xmm6 = _mm_loadu_si128((__m128i*)py); \
    xmm0 = _mm_loadl_epi64((__m128i*)pu); \
    xmm1 = _mm_loadl_epi64((__m128i*)pv); \
    xmm4 = _mm_setzero_si128(); \
  }

#define LOAD_YUV444(mov_instr, reg_type) \
  { \
    xmm6 = _mm_loadu_si128((__m128i*)py); \
    xmm0 = _mm_loadu_si128((__m128i*)pu); \
    xmm1 = _mm_loadu_si128((__m128i*)pv); \
    xmm4 = _mm_setzero_si128(); \
  }
#endif
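
/*
 * OUTPUT_RGBA_32: expects reg0 = blue, reg1 = red and reg2 = green bytes (as
 * produced by the YUV_2_RGB / YUV444_2_RGB macros further down) plus the
 * constant at simd_table+128 as the alpha/filler byte.  It interleaves the
 * channels into 32-bit R,G,B,A pixels and writes four registers' worth to
 * dst at the three caller-supplied offsets.
 */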
#if defined(_M_IX86)
#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
  __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##1 \
    __asm mov_instr reg_type##5, reg_type##0 \
    __asm punpcklbw reg_type##1, reg_type##2 \
    __asm punpcklbw reg_type##0, reg_type##3 \
    __asm punpckhbw reg_type##4, reg_type##2 \
    __asm punpckhbw reg_type##5, reg_type##3 \
    __asm mov_instr reg_type##6, reg_type##1 \
    __asm mov_instr reg_type##7, reg_type##4 \
    __asm punpcklwd reg_type##1, reg_type##0 \
    __asm punpckhwd reg_type##6, reg_type##0 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##7, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##1 \
    __asm MOVNTQ [eax+offset0], reg_type##6 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##7 \
  }

#elif defined(_M_AMD64)
#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
  { \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm4 = _mm_unpackhi_epi8(xmm1, xmm2); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
    xmm5 = _mm_unpackhi_epi8(xmm0, xmm3); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
    xmm6 = _mm_unpackhi_epi16(xmm1, xmm0); \
    xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
    xmm7 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm1); \
    _mm_storeu_si128(dst + offset0, xmm6); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm7); \
  }
#endif
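
/*
 * OUTPUT_ARGB_32: same inputs as OUTPUT_RGBA_32, but interleaves the channels
 * as A,R,G,B within each 32-bit pixel before storing to dst.
 */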
#if defined(_M_IX86)
#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
  __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##3 \
    __asm mov_instr reg_type##5, reg_type##2 \
    __asm punpcklbw reg_type##2, reg_type##0 \
    __asm punpcklbw reg_type##3, reg_type##1 \
    __asm punpckhbw reg_type##5, reg_type##0 \
    __asm punpckhbw reg_type##4, reg_type##1 \
    __asm mov_instr reg_type##0, reg_type##3 \
    __asm mov_instr reg_type##1, reg_type##4 \
    __asm punpcklwd reg_type##3, reg_type##2 \
    __asm punpckhwd reg_type##0, reg_type##2 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##1, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##3 \
    __asm MOVNTQ [eax+offset0], reg_type##0 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##1 \
  }

#elif defined(_M_AMD64)
#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
  { \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm5 = _mm_unpackhi_epi8(xmm2, xmm0); \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
    xmm4 = _mm_unpackhi_epi8(xmm3, xmm1); \
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
    xmm0 = _mm_unpackhi_epi16(xmm3, xmm2); \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm2); \
    xmm1 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm3); \
    _mm_storeu_si128(dst + offset0, xmm0); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm1); \
  }
#endif
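
/*
 * OUTPUT_BGRA_32: same inputs again, producing B,G,R,A byte order within each
 * 32-bit pixel.
 */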
#if defined(_M_IX86)
#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
  __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##0 \
    __asm mov_instr reg_type##5, reg_type##1 \
    __asm punpcklbw reg_type##0, reg_type##2 \
    __asm punpcklbw reg_type##1, reg_type##3 \
    __asm punpckhbw reg_type##4, reg_type##2 \
    __asm punpckhbw reg_type##5, reg_type##3 \
    __asm mov_instr reg_type##6, reg_type##0 \
    __asm mov_instr reg_type##7, reg_type##4 \
    __asm punpcklwd reg_type##0, reg_type##1 \
    __asm punpckhwd reg_type##6, reg_type##1 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##7, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##0 \
    __asm MOVNTQ [eax+offset0], reg_type##6 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##7 \
  }

#elif defined(_M_AMD64)
#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
  { \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm4 = _mm_unpackhi_epi8(xmm0, xmm2); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm2); \
    xmm5 = _mm_unpackhi_epi8(xmm1, xmm3); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \
    xmm6 = _mm_unpackhi_epi16(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm1); \
    xmm7 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm0); \
    _mm_storeu_si128(dst + offset0, xmm6); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm7); \
  }
#endif
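
/*
 * YUV_2_RGB converts one block of 4:2:x data (chroma at half horizontal
 * resolution, so each U/V sample is shared by an even/odd luma pair) into
 * packed B, R and G bytes in reg0, reg1 and reg2.  The arithmetic is
 * fixed-point: chroma is re-centred around zero, luma has 16 subtracted,
 * both are shifted up 3 bits for precision, and pmulhw applies the
 * coefficients stored in simd_table.  Judging by the offsets used below:
 * +0 holds the 128 chroma bias, +16/+32 the U/V green terms, +48 the U blue
 * term, +64 the V red term, +80 the 16 luma bias, +96 the luma coefficient,
 * +112 an even-byte mask, and +128 the constant the OUTPUT macros interleave
 * as the alpha/filler byte.
 */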
#if defined(_M_IX86)
#define YUV_2_RGB(mov_instr, reg_type) \
  __asm { \
    __asm punpcklbw reg_type##0, reg_type##4  /* mm0 = u3 u2 u1 u0 */ \
    __asm punpcklbw reg_type##1, reg_type##4  /* mm1 = v3 v2 v1 v0 */ \
    __asm psubsw reg_type##0, [simd_table]  /* u -= 128 */ \
    __asm psubsw reg_type##1, [simd_table]  /* v -= 128 */ \
    __asm psllw reg_type##0, 3  /* promote precision */ \
    __asm psllw reg_type##1, 3  /* promote precision */ \
    __asm mov_instr reg_type##2, reg_type##0  /* mm2 = u3 u2 u1 u0 */ \
    __asm mov_instr reg_type##3, reg_type##1  /* mm3 = v3 v2 v1 v0 */ \
    __asm pmulhw reg_type##2, [simd_table+16]  /* mm2 = u * u_green */ \
    __asm pmulhw reg_type##3, [simd_table+32]  /* mm3 = v * v_green */ \
    __asm pmulhw reg_type##0, [simd_table+48]  /* mm0 = chroma_b */ \
    __asm pmulhw reg_type##1, [simd_table+64]  /* mm1 = chroma_r */ \
    __asm paddsw reg_type##2, reg_type##3  /* mm2 = chroma_g */ \
    __asm psubusb reg_type##6, [simd_table+80]  /* Y -= 16 */ \
    __asm mov_instr reg_type##7, reg_type##6  /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    __asm pand reg_type##6, [simd_table+112]  /* mm6 = Y6 Y4 Y2 Y0 */ \
    __asm psrlw reg_type##7, 8  /* mm7 = Y7 Y5 Y3 Y1 */ \
    __asm psllw reg_type##6, 3  /* promote precision */ \
    __asm psllw reg_type##7, 3  /* promote precision */ \
    __asm pmulhw reg_type##6, [simd_table+96]  /* mm6 = luma_rgb even */ \
    __asm pmulhw reg_type##7, [simd_table+96]  /* mm7 = luma_rgb odd */ \
    __asm mov_instr reg_type##3, reg_type##0  /* mm3 = chroma_b */ \
    __asm mov_instr reg_type##4, reg_type##1  /* mm4 = chroma_r */ \
    __asm mov_instr reg_type##5, reg_type##2  /* mm5 = chroma_g */ \
    __asm paddsw reg_type##0, reg_type##6  /* mm0 = B6 B4 B2 B0 */ \
    __asm paddsw reg_type##3, reg_type##7  /* mm3 = B7 B5 B3 B1 */ \
    __asm paddsw reg_type##1, reg_type##6  /* mm1 = R6 R4 R2 R0 */ \
    __asm paddsw reg_type##4, reg_type##7  /* mm4 = R7 R5 R3 R1 */ \
    __asm paddsw reg_type##2, reg_type##6  /* mm2 = G6 G4 G2 G0 */ \
    __asm paddsw reg_type##5, reg_type##7  /* mm5 = G7 G5 G3 G1 */ \
    __asm packuswb reg_type##0, reg_type##0  /* saturate to 0-255 */ \
    __asm packuswb reg_type##1, reg_type##1  /* saturate to 0-255 */ \
    __asm packuswb reg_type##2, reg_type##2  /* saturate to 0-255 */ \
    __asm packuswb reg_type##3, reg_type##3  /* saturate to 0-255 */ \
    __asm packuswb reg_type##4, reg_type##4  /* saturate to 0-255 */ \
    __asm packuswb reg_type##5, reg_type##5  /* saturate to 0-255 */ \
    __asm punpcklbw reg_type##0, reg_type##3  /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    __asm punpcklbw reg_type##1, reg_type##4  /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    __asm punpcklbw reg_type##2, reg_type##5  /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
  }
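
/*
 * YUV444_2_RGB does the same conversion for full-resolution chroma: every
 * pixel has its own U and V sample, so the low and high halves of the
 * registers are unpacked and converted separately before being packed back
 * into the B/R/G byte registers expected by the OUTPUT macros.
 */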
#define YUV444_2_RGB(mov_instr, reg_type) \
  __asm { \
    __asm psubusb reg_type##6, [simd_table+80]  /* Y -= 16 */ \
    __asm mov_instr reg_type##7, reg_type##6  /* mm7 = Y */ \
    __asm mov_instr reg_type##5, reg_type##0  /* mm5 = U */ \
    __asm mov_instr reg_type##3, reg_type##1  /* mm3 = V */ \
    __asm punpcklbw reg_type##6, reg_type##4  /* mm6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    __asm punpcklbw reg_type##0, reg_type##4  /* mm0: U7 U6 U5 U4 U3 U2 U1 U0 */ \
    __asm punpcklbw reg_type##1, reg_type##4  /* mm1: V7 V6 V5 V4 V3 V2 V1 V0 */ \
    __asm punpckhbw reg_type##7, reg_type##4  /* mm7: YF YE YD YC YB YA Y9 Y8 */ \
    __asm punpckhbw reg_type##5, reg_type##4  /* mm5: UF UE UD UC UB UA U9 U8 */ \
    __asm punpckhbw reg_type##3, reg_type##4  /* mm3: VF VE VD VC VB VA V9 V8 */ \
    __asm psubsw reg_type##0, [simd_table]  /* Ul -= 128 */ \
    __asm psubsw reg_type##1, [simd_table]  /* Vl -= 128 */ \
    __asm psubsw reg_type##5, [simd_table]  /* Uh -= 128 */ \
    __asm psubsw reg_type##3, [simd_table]  /* Vh -= 128 */ \
    __asm psllw reg_type##0, 3  /* promote precision */ \
    __asm psllw reg_type##1, 3  /* promote precision */ \
    __asm psllw reg_type##3, 3  /* promote precision */ \
    __asm psllw reg_type##5, 3  /* promote precision */ \
    __asm psllw reg_type##6, 3  /* promote precision */ \
    __asm psllw reg_type##7, 3  /* promote precision */ \
    __asm mov_instr reg_type##2, [simd_table+16]  /* gU constant */ \
    __asm mov_instr reg_type##4, [simd_table+32]  /* gV constant */ \
    __asm pmulhw reg_type##6, [simd_table+96]  /* mm6 = luma_rgb low */ \
    __asm pmulhw reg_type##7, [simd_table+96]  /* mm7 = luma_rgb high */ \
    __asm pmulhw reg_type##2, reg_type##0  /* mm2 = Ul * u_green */ \
    __asm pmulhw reg_type##4, reg_type##1  /* mm4 = Vl * v_green */ \
    __asm pmulhw reg_type##0, [simd_table+48]  /* mm0 = chroma_b */ \
    __asm pmulhw reg_type##1, [simd_table+64]  /* mm1 = chroma_r */ \
    __asm paddsw reg_type##2, reg_type##4  /* mm2 = chroma_g */ \
    __asm paddsw reg_type##0, reg_type##6  /* mm0 = B low */ \
    __asm paddsw reg_type##1, reg_type##6  /* mm1 = R low */ \
    __asm paddsw reg_type##2, reg_type##6  /* mm2 = G low */ \
    __asm mov_instr reg_type##6, [simd_table+16]  /* gU constant */ \
    __asm mov_instr reg_type##4, [simd_table+32]  /* gV constant */ \
    __asm pmulhw reg_type##6, reg_type##5  /* mm6 = Uh * u_green */ \
    __asm pmulhw reg_type##4, reg_type##3  /* mm4 = Vh * v_green */ \
    __asm pmulhw reg_type##5, [simd_table+48]  /* mm5 = chroma_b */ \
    __asm pmulhw reg_type##3, [simd_table+64]  /* mm3 = chroma_r */ \
    __asm paddsw reg_type##6, reg_type##4  /* mm6 = chroma_g */ \
    __asm paddsw reg_type##5, reg_type##7  /* mm5 = B high */ \
    __asm paddsw reg_type##3, reg_type##7  /* mm3 = R high */ \
    __asm paddsw reg_type##6, reg_type##7  /* mm6 = G high */ \
    __asm packuswb reg_type##0, reg_type##5  /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    __asm packuswb reg_type##1, reg_type##3  /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    __asm packuswb reg_type##2, reg_type##6  /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
  }
#elif defined(_M_AMD64)
#define YUV_2_RGB(mov_instr, reg_type) \
  { \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm4);  /* mm0 = u3 u2 u1 u0 */ \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);  /* mm1 = v3 v2 v1 v0 */ \
    xmm0 = _mm_subs_epi16(xmm0, _mm_load_si128((__m128i*)simd_table));  /* u -= 128 */ \
    xmm1 = _mm_subs_epi16(xmm1, _mm_load_si128((__m128i*)simd_table));  /* v -= 128 */ \
    xmm0 = _mm_slli_epi16(xmm0, 3);  /* promote precision */ \
    xmm1 = _mm_slli_epi16(xmm1, 3);  /* promote precision */ \
    xmm2 = _mm_mulhi_epi16(xmm0, _mm_load_si128((__m128i*)simd_table+1));  /* mm2 = u * u_green */ \
    xmm3 = _mm_mulhi_epi16(xmm1, _mm_load_si128((__m128i*)simd_table+2));  /* mm3 = v * v_green */ \
    xmm0 = _mm_mulhi_epi16(xmm0, _mm_load_si128((__m128i*)simd_table+3));  /* mm0 = chroma_b */ \
    xmm1 = _mm_mulhi_epi16(xmm1, _mm_load_si128((__m128i*)simd_table+4));  /* mm1 = chroma_r */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm3);  /* mm2 = chroma_g */ \
    xmm6 = _mm_subs_epu8(xmm6, _mm_load_si128((__m128i*)simd_table+5));  /* Y -= 16 */ \
    xmm7 = xmm6;  /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    xmm6 = _mm_and_si128(xmm6, _mm_load_si128((__m128i*)simd_table+7));  /* mm6 = Y6 Y4 Y2 Y0 */ \
    xmm7 = _mm_srli_epi16(xmm7, 8);  /* mm7 = Y7 Y5 Y3 Y1 */ \
    xmm6 = _mm_slli_epi16(xmm6, 3);  /* promote precision */ \
    xmm7 = _mm_slli_epi16(xmm7, 3);  /* promote precision */ \
    xmm6 = _mm_mulhi_epi16(xmm6, _mm_load_si128((__m128i*)simd_table+6));  /* mm6 = luma_rgb even */ \
    xmm7 = _mm_mulhi_epi16(xmm7, _mm_load_si128((__m128i*)simd_table+6));  /* mm7 = luma_rgb odd */ \
    xmm3 = xmm0;  /* mm3 = chroma_b */ \
    xmm4 = xmm1;  /* mm4 = chroma_r */ \
    xmm5 = xmm2;  /* mm5 = chroma_g */ \
    xmm0 = _mm_adds_epi16(xmm0, xmm6);  /* mm0 = B6 B4 B2 B0 */ \
    xmm3 = _mm_adds_epi16(xmm3, xmm7);  /* mm3 = B7 B5 B3 B1 */ \
    xmm1 = _mm_adds_epi16(xmm1, xmm6);  /* mm1 = R6 R4 R2 R0 */ \
    xmm4 = _mm_adds_epi16(xmm4, xmm7);  /* mm4 = R7 R5 R3 R1 */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm6);  /* mm2 = G6 G4 G2 G0 */ \
    xmm5 = _mm_adds_epi16(xmm5, xmm7);  /* mm5 = G7 G5 G3 G1 */ \
    xmm0 = _mm_packus_epi16(xmm0, xmm0);  /* saturate to 0-255 */ \
    xmm1 = _mm_packus_epi16(xmm1, xmm1);  /* saturate to 0-255 */ \
    xmm2 = _mm_packus_epi16(xmm2, xmm2);  /* saturate to 0-255 */ \
    xmm3 = _mm_packus_epi16(xmm3, xmm3);  /* saturate to 0-255 */ \
    xmm4 = _mm_packus_epi16(xmm4, xmm4);  /* saturate to 0-255 */ \
    xmm5 = _mm_packus_epi16(xmm5, xmm5);  /* saturate to 0-255 */ \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3);  /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);  /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);  /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
  }
#define YUV444_2_RGB(mov_instr, reg_type) \
  { \
    xmm6 = _mm_subs_epu8(xmm6, _mm_load_si128((__m128i*)simd_table+5));  /* Y -= 16 */ \
    xmm7 = xmm6;  /* mm7 = Y */ \
    xmm5 = xmm0;  /* mm5 = U */ \
    xmm3 = xmm1;  /* mm3 = V */ \
    xmm6 = _mm_unpacklo_epi8(xmm6, xmm4);  /* mm6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm4);  /* mm0: U7 U6 U5 U4 U3 U2 U1 U0 */ \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);  /* mm1: V7 V6 V5 V4 V3 V2 V1 V0 */ \
    xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);  /* mm7: YF YE YD YC YB YA Y9 Y8 */ \
    xmm5 = _mm_unpackhi_epi8(xmm5, xmm4);  /* mm5: UF UE UD UC UB UA U9 U8 */ \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm4);  /* mm3: VF VE VD VC VB VA V9 V8 */ \
    xmm0 = _mm_subs_epi16(xmm0, _mm_load_si128((__m128i*)simd_table));  /* Ul -= 128 */ \
    xmm1 = _mm_subs_epi16(xmm1, _mm_load_si128((__m128i*)simd_table));  /* Vl -= 128 */ \
    xmm5 = _mm_subs_epi16(xmm5, _mm_load_si128((__m128i*)simd_table));  /* Uh -= 128 */ \
    xmm3 = _mm_subs_epi16(xmm3, _mm_load_si128((__m128i*)simd_table));  /* Vh -= 128 */ \
    xmm0 = _mm_slli_epi16(xmm0, 3);  /* promote precision */ \
    xmm1 = _mm_slli_epi16(xmm1, 3);  /* promote precision */ \
    xmm3 = _mm_slli_epi16(xmm3, 3);  /* promote precision */ \
    xmm5 = _mm_slli_epi16(xmm5, 3);  /* promote precision */ \
    xmm6 = _mm_slli_epi16(xmm6, 3);  /* promote precision */ \
    xmm7 = _mm_slli_epi16(xmm7, 3);  /* promote precision */ \
    xmm2 = _mm_load_si128((__m128i*)simd_table+1);  /* gU constant */ \
    xmm4 = _mm_load_si128((__m128i*)simd_table+2);  /* gV constant */ \
    xmm6 = _mm_mulhi_epi16(xmm6, _mm_load_si128((__m128i*)simd_table+6));  /* mm6 = luma_rgb low */ \
    xmm7 = _mm_mulhi_epi16(xmm7, _mm_load_si128((__m128i*)simd_table+6));  /* mm7 = luma_rgb high */ \
    xmm2 = _mm_mulhi_epi16(xmm2, xmm0);  /* mm2 = Ul * u_green */ \
    xmm4 = _mm_mulhi_epi16(xmm4, xmm1);  /* mm4 = Vl * v_green */ \
    xmm0 = _mm_mulhi_epi16(xmm0, _mm_load_si128((__m128i*)simd_table+3));  /* mm0 = chroma_b */ \
    xmm1 = _mm_mulhi_epi16(xmm1, _mm_load_si128((__m128i*)simd_table+4));  /* mm1 = chroma_r */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm4);  /* mm2 = chroma_g */ \
    xmm0 = _mm_adds_epi16(xmm0, xmm6);  /* mm0 = B low */ \
    xmm1 = _mm_adds_epi16(xmm1, xmm6);  /* mm1 = R low */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm6);  /* mm2 = G low */ \
    xmm6 = _mm_load_si128((__m128i*)simd_table+1);  /* gU constant */ \
    xmm4 = _mm_load_si128((__m128i*)simd_table+2);  /* gV constant */ \
    xmm6 = _mm_mulhi_epi16(xmm6, xmm5);  /* mm6 = Uh * u_green */ \
    xmm4 = _mm_mulhi_epi16(xmm4, xmm3);  /* mm4 = Vh * v_green */ \
    xmm5 = _mm_mulhi_epi16(xmm5, _mm_load_si128((__m128i*)simd_table+3));  /* mm5 = chroma_b */ \
    xmm3 = _mm_mulhi_epi16(xmm3, _mm_load_si128((__m128i*)simd_table+4));  /* mm3 = chroma_r */ \
    xmm6 = _mm_adds_epi16(xmm6, xmm4);  /* mm6 = chroma_g */ \
    xmm5 = _mm_adds_epi16(xmm5, xmm7);  /* mm5 = B high */ \
    xmm3 = _mm_adds_epi16(xmm3, xmm7);  /* mm3 = R high */ \
    xmm6 = _mm_adds_epi16(xmm6, xmm7);  /* mm6 = G high */ \
    xmm0 = _mm_packus_epi16(xmm0, xmm5);  /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    xmm1 = _mm_packus_epi16(xmm1, xmm3);  /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    xmm2 = _mm_packus_epi16(xmm2, xmm6);  /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
  }
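
/*
 * Illustrative sketch only (not part of this header): the macros expect the
 * including .c file to provide locals named py, pu, pv and dst, the
 * coefficient table simd_table, and a MOVNTQ definition chosen from the
 * MMX/SSE/SSE2 variants above.  A 4:4:4-to-RGBA inner loop would then look
 * roughly like the following; VECTOR_PIXELS and the offset values are
 * placeholders, and the pointer arithmetic is an assumption for illustration,
 * not the library's actual conversion code:
 *
 *   for (x = 0; x < width; x += VECTOR_PIXELS) {
 *     LOAD_YUV444(movdqu, xmm);    // xmm6 = Y, xmm0 = U, xmm1 = V, xmm4 = 0
 *     YUV444_2_RGB(movdqu, xmm);   // xmm0 = B, xmm1 = R, xmm2 = G
 *     OUTPUT_RGBA_32(movdqu, xmm, offset0, offset1, offset2);
 *     py += VECTOR_PIXELS; pu += VECTOR_PIXELS; pv += VECTOR_PIXELS;
 *     dst += VECTOR_PIXELS * 4;    // 4 output bytes per pixel
 *   }
 */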
#endif

#endif