Fix the typo for MSVC x64 SSE2 support
[liboggplay.git] / src / liboggplay / x86 / yuv2rgb_x86_vs.h
#ifndef __OGGPLAY_YUV2RGB_VS_H__
#define __OGGPLAY_YUV2RGB_VS_H__
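
/*
 * MSVC-specific YUV -> RGB conversion kernels for liboggplay.
 *
 * On 32-bit builds (_M_IX86) the macros below expand to inline MMX/SSE
 * assembly; on 64-bit builds (_M_AMD64), where MSVC has no inline assembler,
 * they expand to SSE2 intrinsics instead.  MOVNTQ and the mm/xmm variables
 * used by the intrinsic paths are expected to be set up by the translation
 * unit that includes this header.
 */

/* The _M_AMD64 paths use SSE2 intrinsics, so pull in their header here.
   (Assumption: the including .c file may already include it; the extra
   include is harmless.) */
#if defined(_M_AMD64)
#include <emmintrin.h>
#endif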
#define ATTR_ALIGN(_align) __declspec(align(_align))

#define emms() __asm emms
#define MMX_MOVNTQ movq
#define SSE_MOVNTQ movntq
#define SSE2_MOVNTQ movdqu
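
/*
 * LOAD_YUV_PLANAR_2: load one run of luma into register 6 and the matching
 * chroma runs into register 0 (U) and register 1 (V), then clear register 4
 * so it can act as the zero operand for the byte unpacks in YUV_2_RGB.
 * py, pu and pv are assumed to be the plane pointers in the calling function.
 */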
#if defined(_M_IX86)
#define LOAD_YUV_PLANAR_2(mov_instr, reg_type) \
    __asm { \
    __asm mov eax, py \
    __asm mov edx, pu \
    __asm mov_instr reg_type##6, [eax] \
    __asm mov_instr reg_type##0, [edx] \
    __asm mov eax, pv \
    __asm mov_instr reg_type##1, [eax] \
    __asm pxor reg_type##4, reg_type##4 \
    }
#elif defined(_M_AMD64)
#define LOAD_YUV_PLANAR_2(mov_instr, reg_type) \
{ \
    xmm6 = _mm_loadu_si128((__m128i*)py); \
    xmm0 = _mm_loadu_si128((__m128i*)pu); \
    xmm1 = _mm_loadu_si128((__m128i*)pv); \
    xmm4 = _mm_setzero_si128(); \
}
#endif
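
/*
 * OUTPUT_RGBA_32: interleave the packed blue (reg 0), red (reg 1) and green
 * (reg 2) rows produced by YUV_2_RGB with a constant row loaded from
 * simd_table + 128 (presumably the alpha fill) into 32-bit pixels laid out
 * R,G,B,A in memory, and store four vectors to dst at byte offsets 0,
 * offset0, offset1 and offset2.
 */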
#if defined(_M_IX86)
#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
    __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##1 \
    __asm mov_instr reg_type##5, reg_type##0 \
    __asm punpcklbw reg_type##1, reg_type##2 \
    __asm punpcklbw reg_type##0, reg_type##3 \
    __asm punpckhbw reg_type##4, reg_type##2 \
    __asm punpckhbw reg_type##5, reg_type##3 \
    __asm mov_instr reg_type##6, reg_type##1 \
    __asm mov_instr reg_type##7, reg_type##4 \
    __asm punpcklwd reg_type##1, reg_type##0 \
    __asm punpckhwd reg_type##6, reg_type##0 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##7, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##1 \
    __asm MOVNTQ [eax+offset0], reg_type##6 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##7 \
    }
#elif defined(_M_AMD64)
#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
{ \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm4 = _mm_unpackhi_epi8(xmm1, xmm2); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
    xmm5 = _mm_unpackhi_epi8(xmm0, xmm3); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
    xmm6 = _mm_unpackhi_epi16(xmm1, xmm0); \
    xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
    xmm7 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm1); \
    _mm_storeu_si128(dst + offset0, xmm6); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm7); \
}
#endif
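
/*
 * OUTPUT_ARGB_32: same interleave as OUTPUT_RGBA_32, but ordered so that the
 * bytes land in memory as A,R,G,B (the constant/alpha row leads each pixel).
 */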
#if defined(_M_IX86)
#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
    __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##3 \
    __asm mov_instr reg_type##5, reg_type##2 \
    __asm punpcklbw reg_type##2, reg_type##0 \
    __asm punpcklbw reg_type##3, reg_type##1 \
    __asm punpckhbw reg_type##5, reg_type##0 \
    __asm punpckhbw reg_type##4, reg_type##1 \
    __asm mov_instr reg_type##0, reg_type##3 \
    __asm mov_instr reg_type##1, reg_type##4 \
    __asm punpcklwd reg_type##3, reg_type##2 \
    __asm punpckhwd reg_type##0, reg_type##2 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##1, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##3 \
    __asm MOVNTQ [eax+offset0], reg_type##0 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##1 \
    }
#elif defined(_M_AMD64)
#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
{ \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm5 = _mm_unpackhi_epi8(xmm2, xmm0); \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
    xmm4 = _mm_unpackhi_epi8(xmm3, xmm1); \
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
    xmm0 = _mm_unpackhi_epi16(xmm3, xmm2); \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm2); \
    xmm1 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm3); \
    _mm_storeu_si128(dst + offset0, xmm0); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm1); \
}
#endif
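
/*
 * OUTPUT_BGRA_32: same interleave again, with the bytes ordered B,G,R,A in
 * memory.
 */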
#if defined(_M_IX86)
#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
    __asm { \
    __asm mov eax, dst \
    __asm mov_instr reg_type##3, [simd_table+128] \
    __asm mov_instr reg_type##4, reg_type##0 \
    __asm mov_instr reg_type##5, reg_type##1 \
    __asm punpcklbw reg_type##0, reg_type##2 \
    __asm punpcklbw reg_type##1, reg_type##3 \
    __asm punpckhbw reg_type##4, reg_type##2 \
    __asm punpckhbw reg_type##5, reg_type##3 \
    __asm mov_instr reg_type##6, reg_type##0 \
    __asm mov_instr reg_type##7, reg_type##4 \
    __asm punpcklwd reg_type##0, reg_type##1 \
    __asm punpckhwd reg_type##6, reg_type##1 \
    __asm punpcklwd reg_type##4, reg_type##5 \
    __asm punpckhwd reg_type##7, reg_type##5 \
    __asm MOVNTQ [eax], reg_type##0 \
    __asm MOVNTQ [eax+offset0], reg_type##6 \
    __asm MOVNTQ [eax+offset1], reg_type##4 \
    __asm MOVNTQ [eax+offset2], reg_type##7 \
    }
#elif defined(_M_AMD64)
#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
{ \
    xmm3 = _mm_load_si128((__m128i*)simd_table+8); \
    xmm4 = _mm_unpackhi_epi8(xmm0, xmm2); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm2); \
    xmm5 = _mm_unpackhi_epi8(xmm1, xmm3); \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \
    xmm6 = _mm_unpackhi_epi16(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm1); \
    xmm7 = _mm_unpackhi_epi16(xmm4, xmm5); \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
    _mm_storeu_si128(dst, xmm0); \
    _mm_storeu_si128(dst + offset0, xmm6); \
    _mm_storeu_si128(dst + offset1, xmm4); \
    _mm_storeu_si128(dst + offset2, xmm7); \
}
#endif
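
/*
 * YUV_2_RGB: fixed-point YUV -> RGB conversion for one run of pixels.
 * U and V are widened to 16 bits, re-centred by subtracting the bias at
 * simd_table[0], scaled up by 3 bits for precision, and multiplied by the
 * chroma coefficients at simd_table + 16/32/48/64.  Y has the offset at
 * simd_table + 80 subtracted, is split into even and odd samples, and is
 * scaled by the luma coefficient at simd_table + 96.  The chroma and luma
 * terms are then summed and saturate-packed back to bytes, leaving blue in
 * register 0, red in register 1 and green in register 2 for the OUTPUT_*
 * macros.  The coefficient table itself (simd_table) is expected to be
 * provided by the file that includes this header.
 */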
#if defined(_M_IX86)
#define YUV_2_RGB(mov_instr, reg_type) \
    __asm { \
    __asm punpcklbw reg_type##0, reg_type##4 /* mm0 = u3 u2 u1 u0 */ \
    __asm punpcklbw reg_type##1, reg_type##4 /* mm1 = v3 v2 v1 v0 */ \
    __asm psubsw reg_type##0, [simd_table] /* u -= 128 */ \
    __asm psubsw reg_type##1, [simd_table] /* v -= 128 */ \
    __asm psllw reg_type##0, 3 /* promote precision */ \
    __asm psllw reg_type##1, 3 /* promote precision */ \
    __asm mov_instr reg_type##2, reg_type##0 /* mm2 = u3 u2 u1 u0 */ \
    __asm mov_instr reg_type##3, reg_type##1 /* mm3 = v3 v2 v1 v0 */ \
    __asm pmulhw reg_type##2, [simd_table+16] /* mm2 = u * u_green */ \
    __asm pmulhw reg_type##3, [simd_table+32] /* mm3 = v * v_green */ \
    __asm pmulhw reg_type##0, [simd_table+48] /* mm0 = chroma_b */ \
    __asm pmulhw reg_type##1, [simd_table+64] /* mm1 = chroma_r */ \
    __asm paddsw reg_type##2, reg_type##3 /* mm2 = chroma_g */ \
    __asm psubusb reg_type##6, [simd_table+80] /* Y -= 16 */ \
    __asm mov_instr reg_type##7, reg_type##6 /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    __asm pand reg_type##6, [simd_table+112] /* mm6 = Y6 Y4 Y2 Y0 */ \
    __asm psrlw reg_type##7, 8 /* mm7 = Y7 Y5 Y3 Y1 */ \
    __asm psllw reg_type##6, 3 /* promote precision */ \
    __asm psllw reg_type##7, 3 /* promote precision */ \
    __asm pmulhw reg_type##6, [simd_table+96] /* mm6 = luma_rgb even */ \
    __asm pmulhw reg_type##7, [simd_table+96] /* mm7 = luma_rgb odd */ \
    __asm mov_instr reg_type##3, reg_type##0 /* mm3 = chroma_b */ \
    __asm mov_instr reg_type##4, reg_type##1 /* mm4 = chroma_r */ \
    __asm mov_instr reg_type##5, reg_type##2 /* mm5 = chroma_g */ \
    __asm paddsw reg_type##0, reg_type##6 /* mm0 = B6 B4 B2 B0 */ \
    __asm paddsw reg_type##3, reg_type##7 /* mm3 = B7 B5 B3 B1 */ \
    __asm paddsw reg_type##1, reg_type##6 /* mm1 = R6 R4 R2 R0 */ \
    __asm paddsw reg_type##4, reg_type##7 /* mm4 = R7 R5 R3 R1 */ \
    __asm paddsw reg_type##2, reg_type##6 /* mm2 = G6 G4 G2 G0 */ \
    __asm paddsw reg_type##5, reg_type##7 /* mm5 = G7 G5 G3 G1 */ \
    __asm packuswb reg_type##0, reg_type##0 /* saturate to 0-255 */ \
    __asm packuswb reg_type##1, reg_type##1 /* saturate to 0-255 */ \
    __asm packuswb reg_type##2, reg_type##2 /* saturate to 0-255 */ \
    __asm packuswb reg_type##3, reg_type##3 /* saturate to 0-255 */ \
    __asm packuswb reg_type##4, reg_type##4 /* saturate to 0-255 */ \
    __asm packuswb reg_type##5, reg_type##5 /* saturate to 0-255 */ \
    __asm punpcklbw reg_type##0, reg_type##3 /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    __asm punpcklbw reg_type##1, reg_type##4 /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    __asm punpcklbw reg_type##2, reg_type##5 /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
    }
#elif defined(_M_AMD64)
#define YUV_2_RGB(mov_instr, reg_type) \
{ \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); /* mm0 = u3 u2 u1 u0 */ \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); /* mm1 = v3 v2 v1 v0 */ \
    xmm0 = _mm_subs_epi16(xmm0, _mm_load_si128((__m128i*)simd_table)); /* u -= 128 */ \
    xmm1 = _mm_subs_epi16(xmm1, _mm_load_si128((__m128i*)simd_table)); /* v -= 128 */ \
    xmm0 = _mm_slli_epi16(xmm0, 3); /* promote precision */ \
    xmm1 = _mm_slli_epi16(xmm1, 3); /* promote precision */ \
    xmm2 = _mm_mulhi_epi16(xmm0, _mm_load_si128((__m128i*)simd_table+1)); /* mm2 = u * u_green */ \
    xmm3 = _mm_mulhi_epi16(xmm1, _mm_load_si128((__m128i*)simd_table+2)); /* mm3 = v * v_green */ \
    xmm0 = _mm_mulhi_epi16(xmm0, _mm_load_si128((__m128i*)simd_table+3)); /* mm0 = chroma_b */ \
    xmm1 = _mm_mulhi_epi16(xmm1, _mm_load_si128((__m128i*)simd_table+4)); /* mm1 = chroma_r */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm3); /* mm2 = chroma_g */ \
    xmm6 = _mm_subs_epu8(xmm6, _mm_load_si128((__m128i*)simd_table+5)); /* Y -= 16 */ \
    xmm7 = xmm6; /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    xmm6 = _mm_and_si128(xmm6, _mm_load_si128((__m128i*)simd_table+7)); /* mm6 = Y6 Y4 Y2 Y0 */ \
    xmm7 = _mm_srli_epi16(xmm7, 8); /* mm7 = Y7 Y5 Y3 Y1 */ \
    xmm6 = _mm_slli_epi16(xmm6, 3); /* promote precision */ \
    xmm7 = _mm_slli_epi16(xmm7, 3); /* promote precision */ \
    xmm6 = _mm_mulhi_epi16(xmm6, _mm_load_si128((__m128i*)simd_table+6)); /* mm6 = luma_rgb even */ \
    xmm7 = _mm_mulhi_epi16(xmm7, _mm_load_si128((__m128i*)simd_table+6)); /* mm7 = luma_rgb odd */ \
    xmm3 = xmm0; /* mm3 = chroma_b */ \
    xmm4 = xmm1; /* mm4 = chroma_r */ \
    xmm5 = xmm2; /* mm5 = chroma_g */ \
    xmm0 = _mm_adds_epi16(xmm0, xmm6); /* mm0 = B6 B4 B2 B0 */ \
    xmm3 = _mm_adds_epi16(xmm3, xmm7); /* mm3 = B7 B5 B3 B1 */ \
    xmm1 = _mm_adds_epi16(xmm1, xmm6); /* mm1 = R6 R4 R2 R0 */ \
    xmm4 = _mm_adds_epi16(xmm4, xmm7); /* mm4 = R7 R5 R3 R1 */ \
    xmm2 = _mm_adds_epi16(xmm2, xmm6); /* mm2 = G6 G4 G2 G0 */ \
    xmm5 = _mm_adds_epi16(xmm5, xmm7); /* mm5 = G7 G5 G3 G1 */ \
    xmm0 = _mm_packus_epi16(xmm0, xmm0); /* saturate to 0-255 */ \
    xmm1 = _mm_packus_epi16(xmm1, xmm1); /* saturate to 0-255 */ \
    xmm2 = _mm_packus_epi16(xmm2, xmm2); /* saturate to 0-255 */ \
    xmm3 = _mm_packus_epi16(xmm3, xmm3); /* saturate to 0-255 */ \
    xmm4 = _mm_packus_epi16(xmm4, xmm4); /* saturate to 0-255 */ \
    xmm5 = _mm_packus_epi16(xmm5, xmm5); /* saturate to 0-255 */ \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */ \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */ \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */ \
}
#endif
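
/*
 * Typical usage, as a rough sketch only: the loop shape, the step sizes and
 * the local variable names (py, pu, pv, dst, width) are illustrative
 * assumptions, not definitions made by this header.  On the MMX path each
 * iteration converts 8 pixels; the SSE2 path would use movdqu/xmm registers
 * and double the offsets and strides.
 *
 *   for (i = 0; i < width; i += 8) {
 *       LOAD_YUV_PLANAR_2(movq, mm)          // Y -> mm6, U -> mm0, V -> mm1
 *       YUV_2_RGB(movq, mm)                  // B -> mm0, R -> mm1, G -> mm2
 *       OUTPUT_RGBA_32(movq, mm, 8, 16, 24)  // 32 bytes of RGBA out
 *       py += 8; pu += 4; pv += 4; dst += 32;
 *   }
 *   emms();
 */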
#endif /* __OGGPLAY_YUV2RGB_VS_H__ */