// [xy_vsfilter.git] src/dsutil/vd_asm.cpp
// VirtualDub - Video processing and capture application
// Graphics support library
// Copyright (C) 1998-2007 Avery Lee
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
//
// Notes:
// - VDPixmapBlt is from VirtualDub
// - sse2 yv12 to yuy2 conversion by Haali
// (- vd.cpp/h should be renamed to something more sensible already :)
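//
// Format refresher: YV12 is planar 4:2:0 (a full-resolution Y plane
// followed by quarter-resolution V and U planes), while YUY2 is packed
// 4:2:2 with byte order Y0 U0 Y1 V0 - the order the unpack sequences
// below assemble.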

#include "stdafx.h"
#include "vd_asm.h"

#pragma warning(disable : 4799) // no emms... blahblahblah
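// (C4799 warns that a function uses MMX registers without executing
// emms; the MMX routines below intentionally omit it.)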

#ifndef _WIN64
void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
{
    __asm {
        push ebp
        push edi
        push esi
        push ebx

        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width

        shr ecx, 3

yuvtoyuy2row_loop:
        movd mm0, [ebx]
        punpcklbw mm0, [esi]

        movq mm1, [ebp]
        movq mm2, mm1
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0

        movq [edi], mm1
        movq [edi+8], mm2

        add ebp, 8
        add ebx, 4
        add esi, 4
        add edi, 16

        dec ecx
        jnz yuvtoyuy2row_loop

        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    }
}
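
// A plain-C sketch of the row kernel above, added for illustration (the
// helper itself is hypothetical; only the __asm version exists in this
// file). It writes the same packed byte order, Y0 U0 Y1 V0, and assumes
// width is a multiple of 2.
static void yuvtoyuy2row_c(BYTE* dst, const BYTE* srcy, const BYTE* srcu, const BYTE* srcv, DWORD width)
{
    for (DWORD x = 0; x < width; x += 2) {
        *dst++ = srcy[x];      // Y0
        *dst++ = srcu[x >> 1]; // U shared by both pixels
        *dst++ = srcy[x + 1];  // Y1
        *dst++ = srcv[x >> 1]; // V shared by both pixels
    }
}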

void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
{
    static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64;

    __asm {
        push ebp
        push edi
        push esi
        push ebx

        movq mm7, mask

        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width
        mov eax, [esp+40] // pitchuv

        shr ecx, 3

yuvtoyuy2row_avg_loop:
        movd mm0, [ebx]
        punpcklbw mm0, [esi]
        movq mm1, mm0

        movd mm2, [ebx + eax]
        punpcklbw mm2, [esi + eax]
        movq mm3, mm2

        // (x+y)>>1 == (x&y)+((x^y)>>1)
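        // The next four ops implement that identity per byte without
        // overflowing 8 bits: x&y keeps the bits both inputs share, x^y
        // holds the differing bits (halved by the shift), and the 0x7f
        // mask in mm7 clears bits shifted across byte lanes, since MMX
        // has no per-byte right shift.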
        pand mm0, mm2
        pxor mm1, mm3
        psrlq mm1, 1
        pand mm1, mm7
        paddb mm0, mm1

        movq mm1, [ebp]
        movq mm2, mm1
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0

        movq [edi], mm1
        movq [edi+8], mm2

        add ebp, 8
        add ebx, 4
        add esi, 4
        add edi, 16

        dec ecx
        jnz yuvtoyuy2row_avg_loop

        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    }
}
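
// Scalar sketch of the vertical chroma averaging above, added for
// illustration (this helper is hypothetical, not part of the original
// file): each output chroma sample is the floor-average of two samples
// pitchuv bytes apart.
static BYTE avg_floor_c(BYTE a, BYTE b)
{
    // Same overflow-free identity as the MMX path; the & 0x7f mirrors
    // the pand with mm7 (redundant in scalar code, where the operands
    // are promoted to int before the shift).
    return (BYTE)((a & b) + (((a ^ b) >> 1) & 0x7f));
}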

void __declspec(naked) yv12_yuy2_row_sse2() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - halfwidth
        xor eax, eax

one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        movdqa xmm2, [edx + eax] // UUUUUUUU
        movdqa xmm3, [esi + eax] // VVVVVVVV

        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU

        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6

        add eax, 16
        cmp eax, ecx
        jb one

        ret
    }
}
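
// The row kernels use an internal register-based convention rather than
// the C stack: Y/U/V source pointers in ebx/edx/esi, destination in edi,
// the loop bound in ecx, and (for the _linear variants) the chroma stride
// in ebp. They are only reached via call from the yv12_yuy2_sse2* drivers
// below, which load those registers first. The movdqa loads and movntdq
// non-temporal stores both require 16-byte-aligned pointers.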

void __declspec(naked) yv12_yuy2_row_sse2_linear() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - width
        // ebp - uv_stride
        xor eax, eax

one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        movdqa xmm2, [edx]
        movdqa xmm3, [esi]
        pavgb xmm2, [edx + ebp] // UUUUUUUU
        pavgb xmm3, [esi + ebp] // VVVVVVVV

        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU

        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6

        add eax, 16
        add edx, 16
        add esi, 16
        cmp eax, ecx
        jb one

        ret
    }
}
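
// Unlike yv12_yuy2_row_sse2, the _linear variant interpolates chroma
// vertically: pavgb computes a rounded per-byte average, (a+b+1)>>1, of
// two chroma rows one uv_stride apart, upsampling 4:2:0 chroma to 4:2:2
// for progressive content.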

void __declspec(naked) yv12_yuy2_row_sse2_linear_interlaced() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - width
        // ebp - uv_stride
        xor eax, eax

one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        movdqa xmm2, [edx]
        movdqa xmm3, [esi]
        pavgb xmm2, [edx + ebp*2] // UUUUUUUU
        pavgb xmm3, [esi + ebp*2] // VVVVVVVV

        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU

        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6

        add eax, 16
        add edx, 16
        add esi, 16
        cmp eax, ecx
        jb one

        ret
    }
}
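
// The interlaced variant averages chroma rows two uv_strides apart, i.e.
// the nearest chroma rows belonging to the same field, so that top- and
// bottom-field samples are never mixed.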

void __declspec(naked) yv12_yuy2_sse2(const BYTE *Y, const BYTE *U, const BYTE *V,
                                      int halfstride, unsigned halfwidth, unsigned height,
                                      BYTE *YUY2, int d_stride)
{
    __asm {
        push ebx
        push esi
        push edi
        push ebp

        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width

        mov eax, ecx
        add eax, 15
        and eax, 0xfffffff0
        sub [esp + 32], eax

        cmp dword ptr [esp + 40], 2
        jbe last2

row:
        sub dword ptr [esp + 40], 2
        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        call yv12_yuy2_row_sse2_linear

        add edx, [esp + 32]
        add esi, [esp + 32]

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        cmp dword ptr [esp + 40], 2
        ja row

last2:
        call yv12_yuy2_row_sse2

        dec dword ptr [esp + 40]
        jz done

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        call yv12_yuy2_row_sse2
done:
        pop ebp
        pop edi
        pop esi
        pop ebx
        ret
    }
}
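
// Hypothetical usage sketch (names and layout are assumptions, not from
// this file): for a tightly packed width x height YV12 frame, halfstride
// and halfwidth are the chroma plane's stride and width in bytes, and
// every plane and the destination must be 16-byte aligned to satisfy the
// movdqa / movntdq instructions in the row kernels:
//
//     yv12_yuy2_sse2(y, u, v, width / 2, width / 2, height, dst, width * 2);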

void __declspec(naked) yv12_yuy2_sse2_interlaced(const BYTE *Y, const BYTE *U, const BYTE *V,
                                                 int halfstride, unsigned halfwidth, unsigned height,
                                                 BYTE *YUY2, int d_stride)
{
    __asm {
        push ebx
        push esi
        push edi
        push ebp

        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width

        mov eax, ecx
        add eax, 15
        and eax, 0xfffffff0
        sub [esp + 32], eax

        cmp dword ptr [esp + 40], 4
        jbe last4

row:
        sub dword ptr [esp + 40], 4
        call yv12_yuy2_row_sse2 // first row, first field

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        add edx, ebp
        add esi, ebp

        call yv12_yuy2_row_sse2 // first row, second field

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        sub edx, ebp
        sub esi, ebp

        call yv12_yuy2_row_sse2_linear_interlaced // second row, first field

        add edx, [esp + 32]
        add esi, [esp + 32]

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        call yv12_yuy2_row_sse2_linear_interlaced // second row, second field

        add edx, [esp + 32]
        add esi, [esp + 32]

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        cmp dword ptr [esp + 40], 4
        ja row

last4:
        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        add edx, ebp
        add esi, ebp

        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        sub edx, ebp
        sub esi, ebp

        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]

        add edx, ebp
        add esi, ebp

        call yv12_yuy2_row_sse2

        pop ebp
        pop edi
        pop esi
        pop ebx
        ret
    }
}
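
// Each pass through the interlaced driver's row loop emits four output
// rows: two yv12_yuy2_row_sse2 calls copy chroma as-is (one row per
// field), then two yv12_yuy2_row_sse2_linear_interlaced calls produce the
// in-between rows by averaging same-field chroma, advancing U/V by one
// chroma row pair per iteration.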

#endif