X64 transport [Part 2](we do not(?) need _mm_empty in x64 build)
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blob2c6e27f0e92a02db8a529019bd06789369181405
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
26 #if 0
27 #include <fstream>
29 // debug functions
30 //
31 static void SaveRect2File(const CRect& cRect, const char * filename)
33 std::ofstream os(filename);
34 os<<cRect.left<<","<<cRect.top<<","<<cRect.right<<","<<cRect.bottom;
36 static void SaveAxxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
38 std::ofstream axxx(filename);
39 int w = cRect.Width(), h = cRect.Height();
41 BYTE* top = (BYTE*)spd.bits + spd.pitch*cRect.top + cRect.left*4;
42 BYTE* bottom = top + spd.pitch*h;
44 for(; top < bottom ; top += spd.pitch) {
45 BYTE* s = top;
46 BYTE* e = s + w*4;
47 for(; s < e; s+=4) { // ARGB ARGB -> AxYU AxYV
48 axxx<<(int)s[0]<<","<<(int)s[1]<<","<<(int)s[2]<<","<<(int)s[3];
49 if(s+4>=e)
51 axxx<<std::endl;
53 else
55 axxx<<",";
59 axxx.close();
61 static void SaveArgb2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
63 SaveAxxx2File(spd, cRect, filename);
65 static void SaveAyuv2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
67 SaveAxxx2File(spd, cRect, filename);
// Debug helper (only compiled when the surrounding "#if 0" is enabled).
// Dumps a planar surface to `filename` as text. For every pixel column it
// writes four decimal values: one byte from the first plane, one from the
// plane that starts spd.pitch*spd.h bytes later, and two adjacent bytes from
// the plane after that.
// NOTE(review): only cRect's width and height are used; the origin
// (cRect.left/top) is ignored and the dump always starts at spd.bits — confirm
// this is intended.
69 static void SaveNvxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
71 std::ofstream os(filename);
72 int w = cRect.Width(), h = cRect.Height();
// Row range of the first plane: h rows starting at the very beginning of the buffer.
74 BYTE* top = (BYTE*)spd.bits;
75 BYTE* bottom = top + spd.pitch*h;
77 for(; top < bottom ; top += spd.pitch) {
78 BYTE* s = top;
79 BYTE* e = s + w;
// Matching positions in the following planes; each plane is spd.pitch*spd.h bytes.
81 BYTE* sY = s + spd.pitch*spd.h;
82 BYTE* sU = sY + spd.pitch*spd.h;
83 BYTE* sV = sU + 1;
// sU/sV advance two bytes per pixel while s/sY advance one.
84 for(; s < e; s++, sY++, sU+=2,sV+=2) {
85 os<<(int)s[0]<<","<<(int)sY[0]<<","<<(int)sU[0]<<","<<(int)sV[0];
// Last pixel of the row ends the line; otherwise emit a separator.
86 if(s+1>=e)
88 os<<std::endl;
90 else
92 os<<",";
96 os.close();
// ONCER(expr): debug-build variant. Evaluates `expr` the first time control
// reaches this call site, using a function-local static flag to remember that
// it already ran.
99 #define ONCER(expr) {\
100 static bool entered=false;\
101 if(!entered)\
103 entered=true;\
104 expr;\
107 #else
// ONCER(expr): normal variant. Expands to nothing, so `expr` is never evaluated.
108 #define ONCER(expr)
109 #endif
112 // alpha blend functions
114 #include "xy_intrinsics.h"
115 #include "../dsutil/vd.h"
117 #ifndef _WIN64
// MMX implementation of the YUY2 blend (32-bit builds only; see the enclosing
// #ifndef _WIN64). Processes h rows; within a row, every 8 source bytes (two
// 4-byte pixels) are combined with one destination DWORD.
//   w         - row width in source pixels (w*4 source bytes are consumed per row)
//   h         - number of rows
//   d         - destination rows, dstpitch bytes apart
//   s         - source rows, srcpitch bytes apart
// For each destination byte the result is (dst * weight >> 8) + src, where the
// weight is source byte 3 / byte 7 for the two luma bytes and a filtered value
// `ia` for the two chroma bytes.
118 static void AlphaBlt_YUY2_MMX(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
120 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
122 unsigned int ia, c;
123 PCUINT8 s2 = s;
124 PCUINT8 s2end = s2 + w*4;
125 DWORD* d2 = (DWORD*)d;
126 ASSERT(w>0);
// Seed the horizontal filter with the first pixel's byte 3 so the left edge is not darkened.
127 int last_a = w>0?s2[3]:0;
128 for(; s2 < s2end; s2 += 8, d2++)
// 1-2-1 weighted average of byte 3 over the previous, current and next pixel.
130 ia = (last_a + 2*s2[3] + s2[7])>>2;
131 last_a = s2[7];
// ia == 0xff means the destination is left untouched.
132 if(ia < 0xff)
134 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
135 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
136 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
137 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
138 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
// Pack the four per-byte weights (ia) and the four source values (c) in
// destination byte order so one MMX multiply/add handles all four bytes.
140 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
141 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
142 __asm
144 mov edi, d2
145 pxor mm0, mm0
146 movd mm2, c
147 punpcklbw mm2, mm0
148 movd mm3, [edi]
149 punpcklbw mm3, mm0
150 movd mm4, ia
151 punpcklbw mm4, mm0
152 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
153 pmullw mm3, mm4
154 psraw mm3, 7
155 paddsw mm3, mm2
156 packuswb mm3, mm3
157 movd [edi], mm3
// Leave MMX state so later x87 floating-point code works.
162 _mm_empty();
164 #endif
166 void AlphaBlt_YUY2_C(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
168 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
170 DWORD ia;
171 PCUINT8 s2 = s;
172 PCUINT8 s2end = s2 + w*4;
173 DWORD* d2 = (DWORD*)d;
174 ASSERT(w>0);
175 int last_a = w>0?s2[3]:0;
176 for(; s2 < s2end; s2 += 8, d2++)
178 ia = (last_a + 2*s2[3] + s2[7])>>2;
179 last_a = s2[7];
180 if(ia < 0xff)
182 DWORD y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
183 DWORD u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
184 DWORD y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
185 DWORD v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
186 *d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
194 // CMemSubPic
197 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
198 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
200 m_maxsize.SetSize(spd.w, spd.h);
201 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
202 CRect allSpd(0,0,spd.w, spd.h);
203 m_rectListDirty.AddTail(allSpd);
206 CMemSubPic::~CMemSubPic()
208 delete [] m_spd.bits, m_spd.bits = NULL;
211 // ISubPic
213 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
215 return (void*)&m_spd;
218 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
220 spd.type = m_spd.type;
221 spd.w = m_size.cx;
222 spd.h = m_size.cy;
223 spd.bpp = m_spd.bpp;
224 spd.pitch = m_spd.pitch;
225 spd.bits = m_spd.bits;
226 spd.bitsU = m_spd.bitsU;
227 spd.bitsV = m_spd.bitsV;
228 spd.vidrect = m_vidrect;
229 return S_OK;
232 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
234 HRESULT hr;
235 if(FAILED(hr = __super::CopyTo(pSubPic))) {
236 return hr;
239 SubPicDesc src, dst;
240 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
241 return E_FAIL;
243 while(!m_rectListDirty.IsEmpty())
245 CRect& cRect = m_rectListDirty.GetHead();
246 int w = cRect.Width(), h = cRect.Height();
247 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
248 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
249 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
250 memcpy(d, s, w*4);
252 return S_OK;
// Clears every rectangle on the dirty list and empties the list.
// For non-planar surfaces each row of a rectangle is filled with `color`
// (one DWORD per pixel). For MSP_AYUV_PLANAR the first plane is filled with
// 0xFF and the following three planes with 0; `color` is ignored in that case.
// Always returns S_OK.
255 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
257 if(m_rectListDirty.IsEmpty()) {
258 return S_OK;
260 while(!m_rectListDirty.IsEmpty())
262 //pDirtyRect = m_rectListDirty.RemoveHead();
// NOTE(review): this binds a non-const reference to the result of RemoveTail();
// if RemoveTail() returns by value this relies on a compiler extension — confirm.
263 CRect& dirtyRect = m_rectListDirty.RemoveTail();
// First byte of the rectangle; the column offset uses the surface's bytes per pixel.
264 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
265 int w = dirtyRect.Width();
266 if(m_spd.type!=MSP_AYUV_PLANAR)
268 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
270 #ifdef _WIN64
271 memsetd(p, color, w*4); // nya
272 #else
// 32-bit build: "rep stosd" stores `color` to w consecutive DWORDs starting at p.
273 __asm
275 mov eax, color
276 mov ecx, w
277 mov edi, p
279 rep stosd
282 #endif
285 else
287 ///TODO:
288 ///FIX ME
289 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
291 // memsetd(p, 0, m_rcDirty.Width());
292 //DbgLog((LOG_TRACE, 3, "w:%d", w));
293 //w = pDirtyRect->Width();
// Four planes, each m_spd.h*m_spd.pitch bytes apart: the first is set to 0xFF, the rest to 0.
294 memset(p, 0xFF, w);
295 memset(p+m_spd.h*m_spd.pitch, 0, w);
296 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
297 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
// The loop above already drained the list; this call is a no-op safety net.
301 m_rectListDirty.RemoveAll();
302 return S_OK;
305 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
307 return GetDesc(spd);
// Finishes an edit of the picture: picks the post-processing routine that
// matches the surface's own format (m_spd.type) and the format it will later
// be blended onto (m_alpha_blt_dst_type).
// Returns the result of UnlockOther / UnlockRGBA_YUV, or E_NOTIMPL when the
// combination of formats is not handled.
310 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
312 int src_type = m_spd.type;
313 int dst_type = m_alpha_blt_dst_type;
// Combinations where the source is already in the destination's colour family.
314 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
315 dst_type == MSP_RGB24 ||
316 dst_type == MSP_RGB16 ||
317 dst_type == MSP_RGB15))
319 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
321 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
323 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
324 dst_type == MSP_YV12 ||
325 dst_type == MSP_P010 ||
326 dst_type == MSP_P016 ||
327 dst_type == MSP_NV12 ||
328 dst_type == MSP_NV21)))
330 return UnlockOther(dirtyRectList);
// RGBA source that must be converted to a YUV destination format.
332 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
333 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
334 dst_type == MSP_IYUV ||
335 dst_type == MSP_YV12 ||
336 dst_type == MSP_NV12 ||
337 dst_type == MSP_NV21 ||
338 dst_type == MSP_P010 ||
339 dst_type == MSP_P016))
341 return UnlockRGBA_YUV(dirtyRectList);
343 return E_NOTIMPL;
// Post-processes the dirty rectangles in place for the cases where no
// RGB->YUV conversion is needed. After recording the dirty rectangles it walks
// m_rectListDirty and, depending on m_alpha_blt_dst_type, repacks pixels
// (RGB16/RGB15), filters two byte channels horizontally (YUY2), or calls
// SubsampleAndInterlace (P010/P016/NV12/NV21). Returns S_OK.
346 HRESULT CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
348 SetDirtyRectEx(dirtyRectList);
349 if(m_rectListDirty.IsEmpty()) {
350 return S_OK;
353 POSITION pos = m_rectListDirty.GetHeadPosition();
354 while(pos!=NULL)
356 const CRect& cRect = m_rectListDirty.GetNext(pos);
357 int w = cRect.Width(), h = cRect.Height();
// Skip empty or inverted rectangles.
358 if (w<=0 || h<=0)
360 continue;
// [top, bottom) is the byte range of the rectangle's rows (4 bytes per pixel).
363 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
364 BYTE* bottom = top + m_spd.pitch*h;
365 if(m_alpha_blt_dst_type == MSP_RGB16)
367 for(; top < bottom ; top += m_spd.pitch)
369 DWORD* s = (DWORD*)top;
370 DWORD* e = s + w;
371 for(; s < e; s++)
// Top byte is reduced to 5 bits (>>3); colour is repacked as 5-6-5 in the low 16 bits.
373 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
374 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
378 else if(m_alpha_blt_dst_type == MSP_RGB15)
380 for(; top < bottom; top += m_spd.pitch)
382 DWORD* s = (DWORD*)top;
383 DWORD* e = s + w;
384 for(; s < e; s++)
// Same as above but with a 5-5-5 colour layout.
386 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
387 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
391 else if(m_alpha_blt_dst_type == MSP_YUY2)
// NOTE(review): the XY_DO_ONCE calls are debug dumps to a hard-coded G:\ path.
393 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
395 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
397 BYTE* s = tempTop;
398 BYTE* e = s + w*4;
// Filter history starts with the first pixel's own bytes 0 and 2.
399 BYTE last_v = s[0], last_u=s[2];
400 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
// Per pixel pair: byte 4 receives the rounded 1-2-1 average of the byte-0
// channel, and byte 0 receives the rounded 1-2-1 average of the byte-2 channel.
402 BYTE tmp = s[4];
403 s[4] = (last_v + 2*s[0] + s[4] + 2)>>2;
404 last_v = tmp;
406 s[0] = (last_u + 2*s[2] + s[6] + 2)>>2;
407 last_u = s[6];
411 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
413 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV )
415 //nothing to do
417 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
418 || m_alpha_blt_dst_type == MSP_NV12 )
// Second argument selects which of the two chroma planes comes first in the interleaved output.
420 SubsampleAndInterlace(cRect, true);
422 else if( m_alpha_blt_dst_type == MSP_NV21 )
424 SubsampleAndInterlace(cRect, false);
427 return S_OK;
// Converts the dirty rectangles of an RGBA surface in place into the layout
// expected by the YUV blend routines. Colour conversion is delegated to
// ColorConvTable::PreMulArgb2Ayuv. Returns S_OK.
// The ONCER(...) lines are debug dumps that expand to nothing unless the
// debug block at the top of the file is enabled.
430 HRESULT CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
432 //debug
433 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect") );
434 ONCER( SaveArgb2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.argb") );
436 SetDirtyRectEx(dirtyRectList);
438 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect2") );
439 if(m_rectListDirty.IsEmpty()) {
440 return S_OK;
443 POSITION pos = m_rectListDirty.GetHeadPosition();
444 while(pos!=NULL)
446 const CRect& cRect = m_rectListDirty.GetNext(pos);
447 int w = cRect.Width(), h = cRect.Height();
// Skip empty or inverted rectangles.
448 if(w<=0 || h<=0)
450 continue;
// [top, bottom) is the byte range of the rectangle's rows (4 bytes per pixel).
453 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
454 BYTE* bottom = top + m_spd.pitch*h;
// Destination formats with horizontally shared chroma: pixels are handled in pairs.
456 if( m_alpha_blt_dst_type == MSP_YUY2 ||
457 m_alpha_blt_dst_type == MSP_YV12 ||
458 m_alpha_blt_dst_type == MSP_IYUV ||
459 m_alpha_blt_dst_type == MSP_P010 ||
460 m_alpha_blt_dst_type == MSP_P016 ||
461 m_alpha_blt_dst_type == MSP_NV12 ||
462 m_alpha_blt_dst_type == MSP_NV21) {
463 for(; top < bottom ; top += m_spd.pitch) {
464 BYTE* s = top;
465 BYTE* e = s + w*4;
// Seed the chroma filter with the first pixel of the row.
466 DWORD last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
467 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
// Convert only if at least one of the three contributing pixels has byte 3 below 0xff.
468 if((s[3]+s[7]+(last_yuv>>24)) < 0xff*3) {
469 DWORD tmp1 = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
470 DWORD tmp2 = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
// Bits 16-23 of each converted value go to byte 1 of the respective pixel.
472 s[1] = (tmp1>>16)&0xff;
473 s[5] = (tmp2>>16)&0xff;
// Bits 8-15 and bits 0-7 are averaged 1-2-1 over previous/current/next pixel
// (with rounding) and stored in byte 0 of the first and second pixel respectively.
475 s[0] = (((last_yuv>>8)&0xff) + 2*((tmp1>>8)&0xff) + ((tmp2>>8)&0xff) + 2)/4;
476 s[4] = ((last_yuv&0xff) + 2*(tmp1&0xff) + (tmp2&0xff) + 2)/4;
477 last_yuv = tmp2;
478 } else {
// All three byte-3 values are 0xff: keep the filter history and zero the colour bytes.
479 last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
481 s[1] = s[5] = 0;
482 s[0] = s[4] = 0;
487 else if(m_alpha_blt_dst_type == MSP_AYUV) {
488 for(; top < bottom ; top += m_spd.pitch) {
489 BYTE* s = top;
490 BYTE* e = s + w*4;
491 for(; s < e; s+=4) { // ARGB -> AYUV
// Per-pixel conversion; pixels whose byte 3 is 0xff get their colour bytes zeroed instead.
492 if(s[3] < 0xff) {
493 *((DWORD*)s) = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
494 } else {
495 s[0] = s[1] = 0;
496 s[2] = 0;
503 ONCER( SaveAxxx2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.axuv") );
504 return S_OK;
507 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
509 //fix me: check alignment and log error
510 int w = cRect.Width(), h = cRect.Height();
511 BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
512 BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
513 BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
514 BYTE* dst = u_start;
515 if(!u_first)
517 BYTE* tmp = v_start;
518 v_start = u_start;
519 u_start = tmp;
522 //Todo: fix me.
523 //Walkarround for alignment
524 if ( ((m_spd.pitch|w) &15) == 0 && (g_cpuid.m_flags & CCpuID::sse2) )
526 ASSERT(w%16==0);
527 SubsampleAndInterlace(dst, u_start, v_start, h, w, m_spd.pitch);
529 else
531 SubsampleAndInterlaceC(dst, u_start, v_start, h, w, m_spd.pitch);
// Blends the area `pSrc` of this picture onto the area `pDst` of `pTarget`.
// The work is delegated to a format-specific routine chosen from the pair
// (m_spd.type, pTarget->type).
// Returns E_POINTER if any argument is null, E_NOTIMPL for unsupported format
// pairs, otherwise whatever the selected routine returns.
535 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
537 if(!pSrc || !pDst || !pTarget) {
538 return E_POINTER;
540 int src_type = m_spd.type;
541 int dst_type = pTarget->type;
// Format pairs handled by the general-purpose routine.
543 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
544 dst_type == MSP_RGB24 ||
545 dst_type == MSP_RGB16 ||
546 dst_type == MSP_RGB15 ||
547 dst_type == MSP_RGBA ||
548 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
549 dst_type == MSP_AYUV ))
551 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
553 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
555 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
556 dst_type == MSP_YV12)) )
558 return AlphaBltOther(pSrc, pDst, pTarget);
// Planar source onto 8-bit interleaved-chroma targets.
560 else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
561 dst_type == MSP_NV21 ) )
563 return AlphaBltAnv12_Nv12(pSrc, pDst, pTarget);
// Planar source onto 16-bit interleaved-chroma targets.
566 else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
567 dst_type == MSP_P016 ) )
569 return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
// Packed (RGBA-typed) source onto planar / interleaved YUV targets.
571 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
572 dst_type == MSP_YV12))
574 return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
576 else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
577 dst_type == MSP_NV21))
579 return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
581 else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
582 dst_type == MSP_P016))
584 return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
586 return E_NOTIMPL;
// General-purpose blend of the area `pSrc` of this picture onto the area
// `pDst` of `pTarget`, with one code path per destination format.
// A negative target height means a bottom-up target; the destination rectangle
// is mirrored and the destination is then walked with a negated pitch.
// Returns E_INVALIDARG if the two rectangles differ in size, E_NOTIMPL for
// unsupported destination formats, S_OK otherwise.
// In every path the per-pixel weight is source byte 3; a pixel is skipped when
// that byte is at its maximum (0xff, or 0x1f for the 15/16-bit paths).
589 HRESULT CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
591 const SubPicDesc& src = m_spd;
592 SubPicDesc dst = *pTarget; // copy, because we might modify it
594 CRect rs(*pSrc), rd(*pDst);
// Bottom-up target: mirror the destination rectangle vertically.
595 if(dst.h < 0)
597 dst.h = -dst.h;
598 rd.bottom = dst.h - rd.bottom;
599 rd.top = dst.h - rd.top;
601 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
602 return E_INVALIDARG;
604 int w = rs.Width(), h = rs.Height();
605 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
606 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Mirrored rectangle: start at the row above rd.top and step upwards.
607 if(rd.top > rd.bottom)
609 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
610 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
611 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
613 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
615 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
// Planar luma is addressed with 8 bits per pixel regardless of dst.bpp.
617 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
619 else
621 return E_NOTIMPL;
623 dst.pitch = -dst.pitch;
625 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
626 switch(dst.type)
// RGBA target: the three colour bytes are divided by (0x100 - byte 3) and the
// top byte is written as the complement of source byte 3.
628 case MSP_RGBA:
629 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
631 BYTE* s2 = s;
632 BYTE* s2end = s2 + w*4;
633 DWORD* d2 = (DWORD*)d;
634 for(; s2 < s2end; s2 += 4, d2++)
636 if(s2[3] < 0xff)
638 DWORD bd =0x00000100 -( (DWORD) s2[3]);
639 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
640 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
641 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
// '&' binds tighter than '|', so the mask applies only to the last term.
642 *d2 = B | V | R
643 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
647 break;
// 32-bit targets: two channels are blended at once through the 0x00ff00ff mask,
// the remaining one through 0x0000ff00.
648 case MSP_RGB32:
649 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
650 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
652 BYTE* s2 = s;
653 BYTE* s2end = s2 + w*4;
654 DWORD* d2 = (DWORD*)d;
655 for(; s2 < s2end; s2 += 4, d2++)
// NOTE(review): the 64-bit path additionally scales the source by (256 - byte 3),
// while the 32-bit path adds the source unscaled — the two builds produce
// different results; confirm which is intended.
657 #ifdef _WIN64
658 DWORD ia = 256-s2[3];
659 if(s2[3] < 0xff) {
660 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
661 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
663 #else
664 if(s2[3] < 0xff)
666 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
667 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
669 #endif
672 break;
// 24-bit target: 3 destination bytes per 4-byte source pixel, blended bytewise.
673 case MSP_RGB24:
674 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
676 BYTE* s2 = s;
677 BYTE* s2end = s2 + w*4;
678 BYTE* d2 = d;
679 for(; s2 < s2end; s2 += 4, d2 += 3)
681 if(s2[3] < 0xff)
683 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
684 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
685 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
689 break;
// 16-bit 5-6-5 target: the weight in byte 3 is a 5-bit value here (compare
// against 0x1f, shift by 5).
690 case MSP_RGB16:
691 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
693 BYTE* s2 = s;
694 BYTE* s2end = s2 + w*4;
695 WORD* d2 = (WORD*)d;
696 for(; s2 < s2end; s2 += 4, d2++)
698 if(s2[3] < 0x1f)
700 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
701 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
702 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
703 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
704 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
709 break;
710 case MSP_RGB15:
711 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
713 BYTE* s2 = s;
714 BYTE* s2end = s2 + w*4;
715 WORD* d2 = (WORD*)d;
716 for(; s2 < s2end; s2 += 4, d2++)
718 if(s2[3] < 0x1f)
720 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
721 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
722 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
723 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
724 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
729 break;
730 case MSP_YUY2:
731 AlphaBlt_YUY2(w, h, d, dst.pitch, s, src.pitch);
732 break;
// Planar 4:2:0 targets: the source is read as four consecutive planes of
// src.pitch*src.h bytes each (weight, luma, U, V).
733 case MSP_YV12:
734 case MSP_IYUV:
736 //dst.pitch = abs(dst.pitch);
737 int h2 = h/2;
// Derive chroma pitch and plane pointers when the target does not provide them.
738 if(!dst.pitchUV)
740 dst.pitchUV = abs(dst.pitch)/2;
742 if(!dst.bitsU || !dst.bitsV)
744 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
745 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
// YV12 stores the two chroma planes in the opposite order to IYUV.
746 if(dst.type == MSP_YV12)
748 BYTE* p = dst.bitsU;
749 dst.bitsU = dst.bitsV;
750 dst.bitsV = p;
753 BYTE* dd[2];
754 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
755 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
756 if(rd.top > rd.bottom)
758 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
759 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
760 dst.pitchUV = -dst.pitchUV;
// One byte per pixel in each source plane.
763 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
765 BYTE* ss[2];
766 ss[0] = src_origin + src.pitch*src.h*2;//U
767 ss[1] = src_origin + src.pitch*src.h*3;//V
769 AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
771 AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
772 AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
773 #ifndef _WIN64
774 // TODOX64 : fixme!
775 _mm_empty();
776 #endif
778 break;
779 default:
780 return E_NOTIMPL;
781 break;
784 //emms takes about 40 CPU cycles
785 //__asm emms;
786 return S_OK;
789 HRESULT CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
791 const SubPicDesc& src = m_spd;
792 SubPicDesc dst = *pTarget; // copy, because we might modify it
794 CRect rs(*pSrc), rd(*pDst);
796 if(dst.h < 0) {
797 dst.h = -dst.h;
798 rd.bottom = dst.h - rd.bottom;
799 rd.top = dst.h - rd.top;
802 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
803 return E_INVALIDARG;
806 int w = rs.Width(), h = rs.Height();
809 BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
810 BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
812 if(rd.top > rd.bottom) {
813 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
815 dst.pitch = -dst.pitch;
818 for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
820 BYTE* s2 = s;
821 BYTE* s2end = s2 + w*4;
822 WORD* d2 = reinterpret_cast<WORD*>(d);
823 for(; s2 < s2end; s2 += 4, d2++)
825 if(s2[3] < 0xff) {
826 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
831 //UV
832 int h2 = h/2;
833 if(!dst.pitchUV)
835 dst.pitchUV = abs(dst.pitch);
837 if(!dst.bitsU || !dst.bitsV)
839 dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
840 dst.bitsV = dst.bitsU + 2;
842 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
843 if(rd.top > rd.bottom)
845 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
846 dst.pitchUV = -dst.pitchUV;
849 s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
851 d = ddUV;
852 int pitch = src.pitch;
853 for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
855 BYTE* s2 = s;
856 WORD* d2=reinterpret_cast<WORD*>(d);
857 WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
858 DWORD last_alpha = s2[3]+s2[3+src.pitch];
859 for( ; d2<d2_end; s2+=8, d2+=2)
861 unsigned int ia = (
862 last_alpha +
863 (s2[3] + s2[3+src.pitch])*2 +
864 s2[3+4]+ s2[3+4+src.pitch]);
865 last_alpha = s2[3+4]+ s2[3+4+src.pitch];
866 if( ia!=0xFF*8 )
868 d2[0] = (((d2[0])*ia)>>11) + ((s2[0] + s2[0+src.pitch])<<7);
869 d2[1] = (((d2[1])*ia)>>11) + ((s2[4] + s2[4+src.pitch])<<7);
874 return S_OK;
// Blends a packed 4-byte-per-pixel source (byte 3 = weight, byte 1 = luma,
// byte 0 = chroma, alternating between the two chroma channels on even/odd
// pixels) onto a planar 4:2:0 target (MSP_YV12 / MSP_IYUV).
// A negative target height means a bottom-up target.
// Returns E_INVALIDARG if the rectangles differ in size, S_OK otherwise.
877 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
879 const SubPicDesc& src = m_spd;
880 SubPicDesc dst = *pTarget; // copy, because we might modify it
882 CRect rs(*pSrc), rd(*pDst);
// Bottom-up target: mirror the destination rectangle vertically.
884 if(dst.h < 0) {
885 dst.h = -dst.h;
886 rd.bottom = dst.h - rd.bottom;
887 rd.top = dst.h - rd.top;
890 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
891 return E_INVALIDARG;
894 int w = rs.Width(), h = rs.Height();
896 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
897 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
// Mirrored rectangle: start at the row above rd.top and step upwards.
899 if(rd.top > rd.bottom) {
900 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
902 dst.pitch = -dst.pitch;
// Luma pass: dst = dst*weight/256 + source byte 1.
905 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
906 BYTE* s2 = s;
907 BYTE* s2end = s2 + w*4;
908 BYTE* d2 = d;
909 for(; s2 < s2end; s2 += 4, d2++) {
910 if(s2[3] < 0xff) {
911 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
// From here on only the magnitude of the luma pitch is needed.
915 dst.pitch = abs(dst.pitch);
917 int h2 = h/2;
919 if(!dst.pitchUV) {
920 dst.pitchUV = dst.pitch/2;
// ss[0] starts at the first pixel of a pair, ss[1] at the second (+4 bytes);
// byte 0 of each holds one of the two chroma channels.
923 BYTE* ss[2];
924 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
925 ss[1] = ss[0] + 4;
// Derive the chroma planes when the target does not provide them.
927 if(!dst.bitsU || !dst.bitsV) {
928 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
929 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
// YV12 stores the two chroma planes in the opposite order to IYUV.
931 if(dst.type == MSP_YV12) {
932 BYTE* p = dst.bitsU;
933 dst.bitsU = dst.bitsV;
934 dst.bitsV = p;
938 BYTE* dd[2];
939 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
940 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
942 if(rd.top > rd.bottom) {
943 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
944 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
945 dst.pitchUV = -dst.pitchUV;
// Chroma pass, once per chroma plane. Each output sample covers a 2x2 block of
// source pixels.
948 for(ptrdiff_t i = 0; i < 2; i++) {
949 s = ss[i];
950 d = dd[i];
// The weight is always read starting from the first pixel of the pair.
951 BYTE* a = ss[0]+3;
952 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
953 BYTE* s2 = s;
954 BYTE* s2end = s2 + w*4;
955 BYTE* d2 = d;
956 BYTE* a2 = a;
// Weight sums are taken over two vertically adjacent pixels and filtered 1-2-1
// horizontally (total of 8 samples, rounded).
958 DWORD last_alpha = a2[0]+a2[0+src.pitch];
959 for(; s2 < s2end; s2 += 8, d2++, a2 += 8) {
960 unsigned int ia = (last_alpha + 2*(a2[0]+a2[0+src.pitch]) + a2[4] + a2[4+src.pitch] + 4 )>>3;
961 last_alpha = a2[4] + a2[4+src.pitch];
962 if(ia < 0xff) {
// Chroma source value is the average of the two rows.
963 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
969 return S_OK;
// Blends a packed 4-byte-per-pixel source (byte 3 = weight, byte 1 = luma,
// byte 0 = chroma, alternating between the two chroma channels on even/odd
// pixels) onto an 8-bit target with interleaved chroma (MSP_NV12 / MSP_NV21).
// A negative target height means a bottom-up target.
// Returns E_INVALIDARG if the rectangles differ in size, S_OK otherwise.
// The ONCER(...) lines are debug dumps that expand to nothing in normal builds.
972 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
974 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12") );
975 const SubPicDesc& src = m_spd;
976 SubPicDesc dst = *pTarget; // copy, because we might modify it
978 CRect rs(*pSrc), rd(*pDst);
// Bottom-up target: mirror the destination rectangle vertically.
980 if(dst.h < 0) {
981 dst.h = -dst.h;
982 rd.bottom = dst.h - rd.bottom;
983 rd.top = dst.h - rd.top;
986 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
987 return E_INVALIDARG;
990 int w = rs.Width(), h = rs.Height();
992 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
993 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
// Mirrored rectangle: start at the row above rd.top and step upwards.
995 if(rd.top > rd.bottom) {
996 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
998 dst.pitch = -dst.pitch;
// Luma pass: dst = dst*weight/256 + source byte 1.
1001 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1002 BYTE* s2 = s;
1003 BYTE* s2end = s2 + w*4;
1004 BYTE* d2 = d;
1005 for(; s2 < s2end; s2 += 4, d2++) {
1006 if(s2[3] < 0xff) {
1007 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
// From here on only the magnitude of the luma pitch is needed.
1011 dst.pitch = abs(dst.pitch);
1013 int h2 = h/2;
// Interleaved chroma rows are as wide as luma rows.
1015 if(!dst.pitchUV) {
1016 dst.pitchUV = dst.pitch;
// ss[0] starts at the first pixel of a pair, ss[1] at the second (+4 bytes).
1019 BYTE* ss[2];
1020 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1021 ss[1] = ss[0] + 4;
// Derive the chroma pointers when the target does not provide them.
1023 if(!dst.bitsU || !dst.bitsV) {
1024 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1025 dst.bitsV = dst.bitsU + 1;
// NV21 stores the two chroma bytes in the opposite order to NV12.
1027 if(dst.type == MSP_NV21) {
1028 BYTE* p = dst.bitsU;
1029 dst.bitsU = dst.bitsV;
1030 dst.bitsV = p;
// NOTE(review): the column offset is rd.left bytes; this presumably assumes
// rd.left is even so the two chroma channels stay in phase — confirm with callers.
1034 BYTE* dd[2];
1035 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1036 dd[1] = dd[0]+1;
1038 if(rd.top > rd.bottom) {
1039 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1040 dd[1] = dd[0]+1;
1041 dst.pitchUV = -dst.pitchUV;
// Chroma pass, once per chroma channel; destination samples are 2 bytes apart.
1044 for(ptrdiff_t i = 0; i < 2; i++) {
1045 s = ss[i];
1046 d = dd[i];
// The weight is always read starting from the first pixel of the pair.
1047 BYTE* a = ss[0]+3;
1048 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
1049 BYTE* s2 = s;
1050 BYTE* s2end = s2 + w*4;
1051 BYTE* d2 = d;
1052 BYTE* a2 = a;
// Weight sums over two rows, filtered 1-2-1 horizontally (8 samples, rounded).
1053 DWORD last_alpha = a2[0]+a2[0+src.pitch];
1054 for(; s2 < s2end; s2 += 8, d2+=2, a2 += 8) {
1055 unsigned int ia = (last_alpha+2*(a2[0]+a2[0+src.pitch])+a2[4]+a2[4+src.pitch]+4)>>3;
1056 last_alpha = a2[4]+a2[4+src.pitch];
1057 if(ia < 0xff) {
// Chroma source value is the average of the two rows.
1058 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1064 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12_2") );
1065 return S_OK;
1068 HRESULT CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1070 //fix me: check colorspace and log error
1071 const SubPicDesc& src = m_spd;
1072 SubPicDesc dst = *pTarget; // copy, because we might modify it
1074 CRect rs(*pSrc), rd(*pDst);
1075 if(dst.h < 0)
1077 dst.h = -dst.h;
1078 rd.bottom = dst.h - rd.bottom;
1079 rd.top = dst.h - rd.top;
1081 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1082 return E_INVALIDARG;
1084 int w = rs.Width(), h = rs.Height();
1085 bool bottom_down = rd.top > rd.bottom;
1087 BYTE* d = NULL;
1088 BYTE* dUV = NULL;
1089 if(!bottom_down)
1091 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1092 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left*2;
1094 else
1096 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1097 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left*2;
1098 dst.pitch = -dst.pitch;
1100 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1102 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1103 const BYTE* sy = sa + src.pitch*src.h;
1104 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1105 return AlphaBltAnv12_P010(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1108 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1110 //fix me: check colorspace and log error
1111 const SubPicDesc& src = m_spd;
1112 SubPicDesc dst = *pTarget; // copy, because we might modify it
1114 CRect rs(*pSrc), rd(*pDst);
1115 if(dst.h < 0)
1117 dst.h = -dst.h;
1118 rd.bottom = dst.h - rd.bottom;
1119 rd.top = dst.h - rd.top;
1121 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1122 return E_INVALIDARG;
1124 int w = rs.Width(), h = rs.Height();
1125 bool bottom_down = rd.top > rd.bottom;
1127 BYTE* d = NULL;
1128 BYTE* dUV = NULL;
1129 if (!bottom_down)
1131 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1132 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left;
1134 else
1136 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1137 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left;
1138 dst.pitch = -dst.pitch;
1140 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1142 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1143 const BYTE* sy = sa + src.pitch*src.h;
1144 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1146 return AlphaBltAnv12_Nv12(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1149 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1151 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1152 if(dirtyRectList!=NULL)
1154 POSITION pos = dirtyRectList->GetHeadPosition();
1155 if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1156 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1157 || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1159 while(pos!=NULL)
1161 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1162 cRectSrc.left &= ~15;
1163 cRectSrc.right = (cRectSrc.right+15)&~15;
1164 if(cRectSrc.right>m_spd.w)
1166 cRectSrc.right = m_spd.w;
1168 cRectSrc.top &= ~1;
1169 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1172 else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1174 while(pos!=NULL)
1176 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1177 cRectSrc.left &= ~3;
1178 cRectSrc.right = (cRectSrc.right+3)&~3;
1182 return __super::SetDirtyRectEx(dirtyRectList);
1186 // static
// Blends one 8-bit plane: for every byte, dst = dst*alpha/256 + sub, skipping
// positions whose alpha byte is 0xff.
//   dst, dst_pitch   - destination plane and its row stride
//   w, h             - width in bytes and number of rows
//   sub, alpha       - source value plane and weight plane, both with stride sub_pitch
// The SSE2 path is taken only when alpha, sub and dst have the same offset
// within a 16-byte block, both pitches are multiples of 16, w > 32 and the CPU
// reports SSE2; otherwise the plain C routine is used.
1189 void CMemSubPic::AlphaBltYv12Luma(byte* dst, int dst_pitch,
1190 int w, int h,
1191 const byte* sub, const byte* alpha, int sub_pitch)
1193 if( (
1194 ((reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(sub))
1195 |(reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(dst))
1196 | static_cast<intptr_t>(sub_pitch)
1197 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0
1198 && w > 32 && (g_cpuid.m_flags & CCpuID::sse2))
// head: bytes before the first 16-byte boundary; tail: leftover bytes after the
// last full 16-byte chunk; w1: number of bytes covered by full chunks.
1200 int head = (16 - (reinterpret_cast<intptr_t>(alpha)&15))&15;
1201 int tail = (w-head) & 15;
1202 int w1 = w - head - tail;
1203 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1205 const BYTE* sa = alpha;
1206 const BYTE* s2 = sub;
1207 const BYTE* s2end_mod16 = s2 + w1;
1208 const BYTE* s2end = s2 + w;
1209 BYTE* d2 = dst;
// Scalar loop until the source pointer reaches a 16-byte boundary.
1211 for( ; (reinterpret_cast<intptr_t>(s2)&15) != 0; s2++, sa++, d2++)
1213 if(sa[0] < 0xff)
1215 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
// 16 bytes per iteration with the SSE2 helper.
1218 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
1220 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
// Scalar loop for whatever remains of the row.
1222 for(; s2 < s2end; s2++, sa++, d2++)
1224 if(sa[0] < 0xff)
1226 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1231 else //fix me: only a workaround for non-mod-16 size video
1233 CMemSubPic::AlphaBltYv12LumaC(dst, dst_pitch, w, h, sub, alpha, sub_pitch);
1237 void CMemSubPic::AlphaBltYv12LumaC( byte* dst, int dst_pitch, int w, int h, const byte* sub, const byte* alpha, int sub_pitch )
1239 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1241 const BYTE* sa = alpha;
1242 const BYTE* s2 = sub;
1243 const BYTE* s2end = s2 + w;
1244 BYTE* d2 = dst;
1245 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1247 if(sa[0] < 0xff)
1249 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
1250 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1256 void CMemSubPic::AlphaBltYv12Chroma(byte* dst_uv, int dst_pitch,
1257 int w, int chroma_h,
1258 const byte* src_uv, const byte* src_a, int src_pitch)
1260 if( (
1261 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1262 |(reinterpret_cast<intptr_t>(src_a) ^ (2*reinterpret_cast<intptr_t>(dst_uv)))
1263 | static_cast<intptr_t>(src_pitch)
1264 | (2*static_cast<intptr_t>(dst_pitch)) ) & 15) ==0 &&
1265 w > 16 && (g_cpuid.m_flags & CCpuID::sse2))
1267 int head = (16 - (reinterpret_cast<intptr_t>(src_a)&15))&15;
1268 int tail = (w-head) & 15;
1269 int w00 = w - head - tail;
1271 int pitch = src_pitch;
1272 for(int j = 0; j < chroma_h; j++, src_uv += src_pitch*2, src_a += src_pitch*2, dst_uv += dst_pitch)
1274 hleft_vmid_mix_uv_yv12_c2(dst_uv, head, src_uv, src_a, src_pitch);
1275 hleft_vmid_mix_uv_yv12_sse2(dst_uv+(head>>1), w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1276 hleft_vmid_mix_uv_yv12_c2(dst_uv+((head+w00)>>1), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1279 else//fix me: only a workaround for non-mod-16 size video
1281 AlphaBltYv12ChromaC(dst_uv, dst_pitch, w, chroma_h, src_uv, src_a, src_pitch);
1285 void CMemSubPic::AlphaBltYv12ChromaC( byte* dst, int dst_pitch, int w, int chroma_h, const byte* sub_chroma, const byte* alpha, int sub_pitch )
1287 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
1289 hleft_vmid_mix_uv_yv12_c(dst, w, sub_chroma, alpha, sub_pitch);
1293 HRESULT CMemSubPic::AlphaBltAnv12_P010( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1294 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1296 if ( g_cpuid.m_flags & CCpuID::sse2 )
1298 const BYTE* sa = src_a;
1299 if( (
1300 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_y))
1301 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_y))
1302 | static_cast<intptr_t>(src_pitch)
1303 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 &&
1304 w > 32 )
1306 int head = (16 - reinterpret_cast<intptr_t>(src_a)&15)&15;
1307 int tail = (w - head) & 15;
1309 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1311 const BYTE* sa2 = sa;
1312 const BYTE* s2 = src_y;
1313 const BYTE* s2end_mod16 = s2 + (w&~15);
1314 BYTE* d2 = dst_y;
1315 WORD* d_w=reinterpret_cast<WORD*>(dst_y);
1317 switch( head )//important: it is safe since w > 16
1319 case 15:
1320 #define _XY_MIX_ONE if(sa2[0] < 0xff) { d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8); } sa2++;d_w++;s2++;
1321 _XY_MIX_ONE
1322 case 14:
1323 _XY_MIX_ONE
1324 case 13:
1325 _XY_MIX_ONE
1326 case 12:
1327 _XY_MIX_ONE
1328 case 11:
1329 _XY_MIX_ONE
1330 case 10:
1331 _XY_MIX_ONE
1332 case 9:
1333 _XY_MIX_ONE
1334 case 8:
1335 _XY_MIX_ONE
1336 case 7:
1337 _XY_MIX_ONE
1338 case 6:
1339 _XY_MIX_ONE
1340 case 5:
1341 _XY_MIX_ONE
1342 case 4:
1343 _XY_MIX_ONE
1344 case 3:
1345 _XY_MIX_ONE
1346 case 2:
1347 _XY_MIX_ONE
1348 case 1://fall through on purpose
1349 _XY_MIX_ONE
1351 for(; s2 < s2end_mod16; s2+=16, sa2+=16, d_w+=16)
1353 mix_16_y_p010_sse2( reinterpret_cast<BYTE*>(d_w), s2, sa2);
1355 switch( tail )//important: it is safe since w > 16
1357 case 15:
1358 _XY_MIX_ONE
1359 case 14:
1360 _XY_MIX_ONE
1361 case 13:
1362 _XY_MIX_ONE
1363 case 12:
1364 _XY_MIX_ONE
1365 case 11:
1366 _XY_MIX_ONE
1367 case 10:
1368 _XY_MIX_ONE
1369 case 9:
1370 _XY_MIX_ONE
1371 case 8:
1372 _XY_MIX_ONE
1373 case 7:
1374 _XY_MIX_ONE
1375 case 6:
1376 _XY_MIX_ONE
1377 case 5:
1378 _XY_MIX_ONE
1379 case 4:
1380 _XY_MIX_ONE
1381 case 3:
1382 _XY_MIX_ONE
1383 case 2:
1384 _XY_MIX_ONE
1385 case 1://fall through on purpose
1386 _XY_MIX_ONE
1390 else //fix me: only a workaround for non-mod-16 size video
1392 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1394 const BYTE* sa2 = sa;
1395 const BYTE* s2 = src_y;
1396 const BYTE* s2end = s2 + w;
1397 WORD* d_w = reinterpret_cast<WORD*>(dst_y);
1398 for(; s2 < s2end; s2+=1, sa2+=1, d_w+=1)
1400 if(sa2[0] < 0xff)
1402 d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8);
1407 //UV
1408 int h2 = h/2;
1409 BYTE* d = dst_uv;
1410 if( (
1411 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1412 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1413 | static_cast<intptr_t>(src_pitch)
1414 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1415 w > 16 )
1417 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1418 int tail = (w-head) & 15;
1419 int w00 = w - head - tail;
1421 ASSERT(w>0);//the calls to mix may failed if w==0
1422 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1424 hleft_vmid_mix_uv_p010_c2(d, head, src_uv, src_a, src_pitch);
1425 hleft_vmid_mix_uv_p010_sse2(d+2*head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1426 hleft_vmid_mix_uv_p010_c2(d+2*(head+w00), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1429 else
1431 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1433 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1436 #ifndef _WIN64
1437 // TODOX64 : fixme!
1438 _mm_empty();
1439 #endif
1440 return S_OK;
1442 else
1444 return AlphaBltAnv12_P010_C(src_a, src_y, src_uv, src_pitch, dst_y, dst_uv, dst_pitch, w, h);
1448 HRESULT CMemSubPic::AlphaBltAnv12_P010_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1450 const BYTE* sa = src_a;
1451 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1453 const BYTE* sa2 = sa;
1454 const BYTE* s2 = src_y;
1455 const BYTE* s2end = s2 + w;
1456 WORD* d2 = reinterpret_cast<WORD*>(dst_y);
1457 for(; s2 < s2end; s2+=1, sa2+=1, d2+=1)
1459 if(sa2[0] < 0xff)
1461 d2[0] = ((d2[0]*sa2[0])>>8) + (s2[0]<<8);
1465 //UV
1466 int h2 = h/2;
1467 BYTE* d = dst_uv;
1468 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1470 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1472 return S_OK;
1475 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1476 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1478 AlphaBltYv12Luma( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1480 int h2 = h/2;
1481 if( (
1482 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1483 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1484 | static_cast<intptr_t>(src_pitch)
1485 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1486 w > 16 && (g_cpuid.m_flags & CCpuID::sse2) )
1488 BYTE* d = dst_uv;
1490 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1491 int tail = (w-head) & 15;
1492 int w00 = w - head - tail;
1494 ASSERT(w>0);//the calls to mix may failed if w==0
1495 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1497 hleft_vmid_mix_uv_nv12_c2(d, head, src_uv, src_a, src_pitch);
1498 hleft_vmid_mix_uv_nv12_sse2(d+head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1499 hleft_vmid_mix_uv_nv12_c2(d+head+w00, tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1501 #ifndef _WIN64
1502 // TODOX64 : fixme!
1503 _mm_empty();
1504 #endif
1506 else
1508 BYTE* d = dst_uv;
1509 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1511 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1514 return S_OK;
1517 HRESULT CMemSubPic::AlphaBltAnv12_Nv12_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1519 AlphaBltYv12LumaC( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1520 int h2 = h/2;
1521 BYTE* d = dst_uv;
1522 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1524 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1526 return S_OK;
1529 void CMemSubPic::SubsampleAndInterlace( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1531 for (int i=0;i<h;i+=2)
1533 hleft_vmid_subsample_and_interlace_2_line_sse2(dst, u, v, w, pitch);
1534 u += 2*pitch;
1535 v += 2*pitch;
1536 dst += pitch;
1540 void CMemSubPic::SubsampleAndInterlaceC( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1542 for (int i=0;i<h;i+=2)
1544 hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w, pitch);
1545 u += 2*pitch;
1546 v += 2*pitch;
1547 dst += pitch;
1551 void CMemSubPic::AlphaBlt_YUY2(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
1553 #ifdef _WIN64
1554 AlphaBlt_YUY2_C(w, h, d, dstpitch, s, srcpitch);
1555 #else
1556 AlphaBlt_YUY2_MMX(w, h, d, dstpitch, s, srcpitch);
1557 #endif
1561 // CMemSubPicAllocator
1564 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1565 : CSubPicExAllocatorImpl(maxsize, false, false)
1566 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1567 , m_maxsize(maxsize)
1568 , m_type(type)
1570 if(m_type==-1)
1572 switch(alpha_blt_dst_type)
1574 case MSP_YUY2:
1575 m_type = MSP_XY_AUYV;
1576 break;
1577 case MSP_AYUV:
1578 m_type = MSP_AYUV;
1579 break;
1580 case MSP_IYUV:
1581 case MSP_YV12:
1582 case MSP_P010:
1583 case MSP_P016:
1584 case MSP_NV12:
1585 case MSP_NV21:
1586 m_type = MSP_AYUV_PLANAR;
1587 break;
1588 default:
1589 m_type = MSP_RGBA;
1590 break;
1595 // ISubPicAllocatorImpl
1597 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1599 if(!ppSubPic) {
1600 return false;
1602 SubPicDesc spd;
1603 spd.w = m_maxsize.cx;
1604 spd.h = m_maxsize.cy;
1605 spd.bpp = 32;
1606 spd.pitch = (spd.w*spd.bpp)>>3;
1607 spd.type = m_type;
1608 spd.bits = DNew BYTE[spd.pitch*spd.h];
1609 if(!spd.bits) {
1610 return false;
1612 *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1613 if(!(*ppSubPic)) {
1614 return false;
1616 (*ppSubPic)->AddRef();
1617 return true;