X64 transport [Part 1] (Fix code for x64)
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blob761763923c19feb212290a915f5676f3d988f57a
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
26 #if 0
27 #include <fstream>
29 // debug functions
30 //
31 static void SaveRect2File(const CRect& cRect, const char * filename)
33 std::ofstream os(filename);
34 os<<cRect.left<<","<<cRect.top<<","<<cRect.right<<","<<cRect.bottom;
36 static void SaveAxxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
38 std::ofstream axxx(filename);
39 int w = cRect.Width(), h = cRect.Height();
41 BYTE* top = (BYTE*)spd.bits + spd.pitch*cRect.top + cRect.left*4;
42 BYTE* bottom = top + spd.pitch*h;
44 for(; top < bottom ; top += spd.pitch) {
45 BYTE* s = top;
46 BYTE* e = s + w*4;
47 for(; s < e; s+=4) { // ARGB ARGB -> AxYU AxYV
48 axxx<<(int)s[0]<<","<<(int)s[1]<<","<<(int)s[2]<<","<<(int)s[3];
49 if(s+4>=e)
51 axxx<<std::endl;
53 else
55 axxx<<",";
59 axxx.close();
61 static void SaveArgb2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
63 SaveAxxx2File(spd, cRect, filename);
65 static void SaveAyuv2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
67 SaveAxxx2File(spd, cRect, filename);
69 static void SaveNvxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
71 std::ofstream os(filename);
72 int w = cRect.Width(), h = cRect.Height();
74 BYTE* top = (BYTE*)spd.bits;
75 BYTE* bottom = top + spd.pitch*h;
77 for(; top < bottom ; top += spd.pitch) {
78 BYTE* s = top;
79 BYTE* e = s + w;
81 BYTE* sY = s + spd.pitch*spd.h;
82 BYTE* sU = sY + spd.pitch*spd.h;
83 BYTE* sV = sU + 1;
84 for(; s < e; s++, sY++, sU+=2,sV+=2) {
85 os<<(int)s[0]<<","<<(int)sY[0]<<","<<(int)sU[0]<<","<<(int)sV[0];
86 if(s+1>=e)
88 os<<std::endl;
90 else
92 os<<",";
96 os.close();
// ONCER(expr): debug variant (this definition lives in the `#if 0` branch;
// the `#else` branch defines ONCER as empty). It expands to a block with a
// function-local static flag so that `expr` is evaluated only the first
// time a given expansion site is reached.
99 #define ONCER(expr) {\
100 static bool entered=false;\
101 if(!entered)\
103 entered=true;\
104 expr;\
107 #else
108 #define ONCER(expr)
109 #endif
112 // alpha blend functions
114 #include "xy_intrinsics.h"
115 #include "../dsutil/vd.h"
117 #ifndef _WIN64
// MMX (MSVC inline asm) alpha blend of a w x h block of 4-byte source pixels
// onto a destination that is written one DWORD per two source pixels.
// Only built when _WIN64 is not defined.
//   w, h      - size of the block in source pixels / rows
//   d         - first destination byte, advanced by dstpitch per row
//   s         - first source byte, advanced by srcpitch per row
// NOTE(review): each iteration reads s2[7], so w is presumably expected to
// be even - confirm against callers.
118 static void AlphaBlt_YUY2_MMX(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
120 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
122 unsigned int ia, c;
123 PCUINT8 s2 = s;
124 PCUINT8 s2end = s2 + w*4;
125 DWORD* d2 = (DWORD*)d;
126 ASSERT(w>0);
// Seed the running alpha with the first pixel's alpha (byte 3).
127 int last_a = w>0?s2[3]:0;
// Two source pixels (8 bytes) are consumed per destination DWORD.
128 for(; s2 < s2end; s2 += 8, d2++)
// Weighted 1:2:1 average of the previous, current and next alpha bytes.
130 ia = (last_a + 2*s2[3] + s2[7])>>2;
131 last_a = s2[7];
// Destination is left untouched when the averaged alpha reaches 0xff.
132 if(ia < 0xff)
134 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
135 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
136 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
137 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
138 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
// Pack the four per-byte multipliers (ia, s2[7], ia, s2[3]) and the four
// source bytes to add (s2[4], s2[5], s2[0], s2[1]) into one DWORD each.
140 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
141 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
// The asm widens all three DWORDs to 16-bit lanes, multiplies the
// destination lanes by the halved multipliers, shifts right by 7, adds the
// source lanes with signed saturation and packs the result back to bytes.
142 __asm
144 mov edi, d2
145 pxor mm0, mm0
146 movd mm2, c
147 punpcklbw mm2, mm0
148 movd mm3, [edi]
149 punpcklbw mm3, mm0
150 movd mm4, ia
151 punpcklbw mm4, mm0
152 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
153 pmullw mm3, mm4
154 psraw mm3, 7
155 paddsw mm3, mm2
156 packuswb mm3, mm3
157 movd [edi], mm3
// Leave MMX state once all rows are done.
162 _mm_empty();
164 #endif
166 void AlphaBlt_YUY2_C(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
168 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
170 DWORD ia;
171 PCUINT8 s2 = s;
172 PCUINT8 s2end = s2 + w*4;
173 DWORD* d2 = (DWORD*)d;
174 ASSERT(w>0);
175 int last_a = w>0?s2[3]:0;
176 for(; s2 < s2end; s2 += 8, d2++)
178 ia = (last_a + 2*s2[3] + s2[7])>>2;
179 last_a = s2[7];
180 if(ia < 0xff)
182 DWORD y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
183 DWORD u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
184 DWORD y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
185 DWORD v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
186 *d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
194 // CMemSubPic
197 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
198 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
200 m_maxsize.SetSize(spd.w, spd.h);
201 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
202 CRect allSpd(0,0,spd.w, spd.h);
203 m_rectListDirty.AddTail(allSpd);
206 CMemSubPic::~CMemSubPic()
208 delete [] m_spd.bits, m_spd.bits = NULL;
211 // ISubPic
213 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
215 return (void*)&m_spd;
218 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
220 spd.type = m_spd.type;
221 spd.w = m_size.cx;
222 spd.h = m_size.cy;
223 spd.bpp = m_spd.bpp;
224 spd.pitch = m_spd.pitch;
225 spd.bits = m_spd.bits;
226 spd.bitsU = m_spd.bitsU;
227 spd.bitsV = m_spd.bitsV;
228 spd.vidrect = m_vidrect;
229 return S_OK;
232 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
234 HRESULT hr;
235 if(FAILED(hr = __super::CopyTo(pSubPic))) {
236 return hr;
239 SubPicDesc src, dst;
240 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
241 return E_FAIL;
243 while(!m_rectListDirty.IsEmpty())
245 CRect& cRect = m_rectListDirty.GetHead();
246 int w = cRect.Width(), h = cRect.Height();
247 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
248 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
249 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
250 memcpy(d, s, w*4);
252 return S_OK;
// Resets the pixels inside every pending dirty rectangle and leaves the
// dirty list empty. Always returns S_OK.
//   color - fill value for non-planar surfaces (one DWORD per pixel);
//           it is not used on the MSP_AYUV_PLANAR path.
255 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
257 if(m_rectListDirty.IsEmpty()) {
258 return S_OK;
// Rectangles are consumed from the tail until the list is empty.
260 while(!m_rectListDirty.IsEmpty())
262 //pDirtyRect = m_rectListDirty.RemoveHead();
// NOTE(review): RemoveTail() presumably returns by value, in which case
// binding it to a non-const reference relies on a compiler extension - confirm.
263 CRect& dirtyRect = m_rectListDirty.RemoveTail();
// First byte of the rectangle; bpp>>3 is the byte size of one pixel.
264 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
265 int w = dirtyRect.Width();
266 if(m_spd.type!=MSP_AYUV_PLANAR)
// Packed surfaces: fill every row of the rectangle with `color`.
268 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
270 #ifdef _WIN64
271 memsetd(p, color, w*4); // nya
272 #else
// 32-bit build: rep stosd stores eax (color) ecx (w) times at edi (p).
273 __asm
275 mov eax, color
276 mov ecx, w
277 mov edi, p
279 rep stosd
282 #endif
285 else
287 ///TODO:
288 ///FIX ME
// Planar surface: four planes, each m_spd.h*m_spd.pitch bytes apart.
// The first plane is set to 0xFF, the other three to 0.
289 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
291 // memsetd(p, 0, m_rcDirty.Width());
292 //DbgLog((LOG_TRACE, 3, "w:%d", w));
293 //w = pDirtyRect->Width();
294 memset(p, 0xFF, w);
295 memset(p+m_spd.h*m_spd.pitch, 0, w);
296 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
297 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
// The loop above already drained the list; this call is a no-op safeguard.
301 m_rectListDirty.RemoveAll();
302 return S_OK;
305 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
307 return GetDesc(spd);
310 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
312 int src_type = m_spd.type;
313 int dst_type = m_alpha_blt_dst_type;
314 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
315 dst_type == MSP_RGB24 ||
316 dst_type == MSP_RGB16 ||
317 dst_type == MSP_RGB15))
319 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
321 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
323 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
324 dst_type == MSP_YV12 ||
325 dst_type == MSP_P010 ||
326 dst_type == MSP_P016 ||
327 dst_type == MSP_NV12 ||
328 dst_type == MSP_NV21)))
330 return UnlockOther(dirtyRectList);
332 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
333 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
334 dst_type == MSP_IYUV ||
335 dst_type == MSP_YV12 ||
336 dst_type == MSP_NV12 ||
337 dst_type == MSP_NV21 ||
338 dst_type == MSP_P010 ||
339 dst_type == MSP_P016))
341 return UnlockRGBA_YUV(dirtyRectList);
343 return E_NOTIMPL;
346 HRESULT CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
348 SetDirtyRectEx(dirtyRectList);
349 if(m_rectListDirty.IsEmpty()) {
350 return S_OK;
353 POSITION pos = m_rectListDirty.GetHeadPosition();
354 while(pos!=NULL)
356 const CRect& cRect = m_rectListDirty.GetNext(pos);
357 int w = cRect.Width(), h = cRect.Height();
358 if (w<=0 || h<=0)
360 continue;
363 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
364 BYTE* bottom = top + m_spd.pitch*h;
365 if(m_alpha_blt_dst_type == MSP_RGB16)
367 for(; top < bottom ; top += m_spd.pitch)
369 DWORD* s = (DWORD*)top;
370 DWORD* e = s + w;
371 for(; s < e; s++)
373 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
374 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
378 else if(m_alpha_blt_dst_type == MSP_RGB15)
380 for(; top < bottom; top += m_spd.pitch)
382 DWORD* s = (DWORD*)top;
383 DWORD* e = s + w;
384 for(; s < e; s++)
386 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
387 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
391 else if(m_alpha_blt_dst_type == MSP_YUY2)
393 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
395 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
397 BYTE* s = tempTop;
398 BYTE* e = s + w*4;
399 BYTE last_v = s[0], last_u=s[2];
400 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
402 BYTE tmp = s[4];
403 s[4] = (last_v + 2*s[0] + s[4] + 2)>>2;
404 last_v = tmp;
406 s[0] = (last_u + 2*s[2] + s[6] + 2)>>2;
407 last_u = s[6];
411 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
413 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV )
415 //nothing to do
417 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
418 || m_alpha_blt_dst_type == MSP_NV12 )
420 SubsampleAndInterlace(cRect, true);
422 else if( m_alpha_blt_dst_type == MSP_NV21 )
424 SubsampleAndInterlace(cRect, false);
427 return S_OK;
430 HRESULT CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
432 //debug
433 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect") );
434 ONCER( SaveArgb2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.argb") );
436 SetDirtyRectEx(dirtyRectList);
438 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect2") );
439 if(m_rectListDirty.IsEmpty()) {
440 return S_OK;
443 POSITION pos = m_rectListDirty.GetHeadPosition();
444 while(pos!=NULL)
446 const CRect& cRect = m_rectListDirty.GetNext(pos);
447 int w = cRect.Width(), h = cRect.Height();
448 if(w<=0 || h<=0)
450 continue;
453 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
454 BYTE* bottom = top + m_spd.pitch*h;
456 if( m_alpha_blt_dst_type == MSP_YUY2 ||
457 m_alpha_blt_dst_type == MSP_YV12 ||
458 m_alpha_blt_dst_type == MSP_IYUV ||
459 m_alpha_blt_dst_type == MSP_P010 ||
460 m_alpha_blt_dst_type == MSP_P016 ||
461 m_alpha_blt_dst_type == MSP_NV12 ||
462 m_alpha_blt_dst_type == MSP_NV21) {
463 for(; top < bottom ; top += m_spd.pitch) {
464 BYTE* s = top;
465 BYTE* e = s + w*4;
466 DWORD last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
467 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
468 if((s[3]+s[7]+(last_yuv>>24)) < 0xff*3) {
469 DWORD tmp1 = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
470 DWORD tmp2 = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
472 s[1] = (tmp1>>16)&0xff;
473 s[5] = (tmp2>>16)&0xff;
475 s[0] = (((last_yuv>>8)&0xff) + 2*((tmp1>>8)&0xff) + ((tmp2>>8)&0xff) + 2)/4;
476 s[4] = ((last_yuv&0xff) + 2*(tmp1&0xff) + (tmp2&0xff) + 2)/4;
477 last_yuv = tmp2;
478 } else {
479 last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
481 s[1] = s[5] = 0;
482 s[0] = s[4] = 0;
487 else if(m_alpha_blt_dst_type == MSP_AYUV) {
488 for(; top < bottom ; top += m_spd.pitch) {
489 BYTE* s = top;
490 BYTE* e = s + w*4;
491 for(; s < e; s+=4) { // ARGB -> AYUV
492 if(s[3] < 0xff) {
493 *((DWORD*)s) = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
494 } else {
495 s[0] = s[1] = 0;
496 s[2] = 0;
503 ONCER( SaveAxxx2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.axuv") );
504 return S_OK;
507 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
509 //fix me: check alignment and log error
510 int w = cRect.Width(), h = cRect.Height();
511 BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
512 BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
513 BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
514 BYTE* dst = u_start;
515 if(!u_first)
517 BYTE* tmp = v_start;
518 v_start = u_start;
519 u_start = tmp;
522 //Todo: fix me.
523 //Walkarround for alignment
524 if ( ((m_spd.pitch|w) &15) == 0 && (g_cpuid.m_flags & CCpuID::sse2) )
526 ASSERT(w%16==0);
527 SubsampleAndInterlace(dst, u_start, v_start, h, w, m_spd.pitch);
529 else
531 SubsampleAndInterlaceC(dst, u_start, v_start, h, w, m_spd.pitch);
535 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
537 if(!pSrc || !pDst || !pTarget) {
538 return E_POINTER;
540 int src_type = m_spd.type;
541 int dst_type = pTarget->type;
543 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
544 dst_type == MSP_RGB24 ||
545 dst_type == MSP_RGB16 ||
546 dst_type == MSP_RGB15 ||
547 dst_type == MSP_RGBA ||
548 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
549 dst_type == MSP_AYUV ))
551 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
553 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
555 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
556 dst_type == MSP_YV12)) )
558 return AlphaBltOther(pSrc, pDst, pTarget);
560 else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
561 dst_type == MSP_NV21 ) )
563 return AlphaBltAnv12_Nv12(pSrc, pDst, pTarget);
566 else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
567 dst_type == MSP_P016 ) )
569 return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
571 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
572 dst_type == MSP_YV12))
574 return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
576 else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
577 dst_type == MSP_NV21))
579 return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
581 else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
582 dst_type == MSP_P016))
584 return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
586 return E_NOTIMPL;
// Blends the pSrc rectangle of this sub-picture onto the pDst rectangle of
// pTarget for the packed RGB / RGBA / AYUV / YUY2 targets and for planar
// YV12 / IYUV targets.
//   pSrc, pDst - source and destination rectangles; they must have equal
//                width and equal absolute height
//   pTarget    - destination surface description (copied, not modified)
// Returns E_INVALIDARG on a size mismatch, E_NOTIMPL for an unsupported
// target type, S_OK otherwise.
589 HRESULT CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
591 const SubPicDesc& src = m_spd;
592 SubPicDesc dst = *pTarget; // copy, because we might modify it
594 CRect rs(*pSrc), rd(*pDst);
// A negative target height is made positive and the destination rectangle
// is mirrored vertically, which leaves rd.top > rd.bottom.
595 if(dst.h < 0)
597 dst.h = -dst.h;
598 rd.bottom = dst.h - rd.bottom;
599 rd.top = dst.h - rd.top;
601 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
602 return E_INVALIDARG;
604 int w = rs.Width(), h = rs.Height();
605 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
606 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Mirrored destination: start one row above rd.top and walk upwards by
// negating the pitch.
607 if(rd.top > rd.bottom)
609 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
610 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
611 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
613 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
615 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
617 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
619 else
621 return E_NOTIMPL;
623 dst.pitch = -dst.pitch;
625 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
626 switch(dst.type)
// RGBA target: source pixels with alpha byte below 0xff overwrite the
// destination. Each colour channel is divided by (0x100 - alpha byte) and
// the stored alpha byte is subtracted from 0xff.
628 case MSP_RGBA:
629 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
631 BYTE* s2 = s;
632 BYTE* s2end = s2 + w*4;
633 DWORD* d2 = (DWORD*)d;
634 for(; s2 < s2end; s2 += 4, d2++)
636 if(s2[3] < 0xff)
638 DWORD bd =0x00000100 -( (DWORD) s2[3]);
639 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
640 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
641 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
642 *d2 = B | V | R
643 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
647 break;
// 32-bit targets: the destination is scaled by the source alpha byte, two
// channels at a time through the 0x00ff00ff / 0x0000ff00 masks. The x64
// build additionally scales the source by (256 - alpha byte); the 32-bit
// build adds the source unscaled.
648 case MSP_RGB32:
649 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
650 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
652 BYTE* s2 = s;
653 BYTE* s2end = s2 + w*4;
654 DWORD* d2 = (DWORD*)d;
655 for(; s2 < s2end; s2 += 4, d2++)
657 #ifdef _WIN64
658 DWORD ia = 256-s2[3];
659 if(s2[3] < 0xff) {
660 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
661 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
663 #else
664 if(s2[3] < 0xff)
666 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
667 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
669 #endif
672 break;
// 24-bit target: three destination bytes per 4-byte source pixel.
673 case MSP_RGB24:
674 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
676 BYTE* s2 = s;
677 BYTE* s2end = s2 + w*4;
678 BYTE* d2 = d;
679 for(; s2 < s2end; s2 += 4, d2 += 3)
681 if(s2[3] < 0xff)
683 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
684 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
685 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
689 break;
// 16-bit targets: the alpha byte is compared against 0x1f and the product
// is shifted by 5, i.e. alpha is treated as a 5-bit value here.
690 case MSP_RGB16:
691 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
693 BYTE* s2 = s;
694 BYTE* s2end = s2 + w*4;
695 WORD* d2 = (WORD*)d;
696 for(; s2 < s2end; s2 += 4, d2++)
698 if(s2[3] < 0x1f)
700 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
701 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
702 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
703 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
704 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
709 break;
710 case MSP_RGB15:
711 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
713 BYTE* s2 = s;
714 BYTE* s2end = s2 + w*4;
715 WORD* d2 = (WORD*)d;
716 for(; s2 < s2end; s2 += 4, d2++)
718 if(s2[3] < 0x1f)
720 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
721 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
722 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
723 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
724 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
729 break;
730 case MSP_YUY2:
731 AlphaBlt_YUY2(w, h, d, dst.pitch, s, src.pitch);
732 break;
// Planar targets: the source is read as four stacked planes, each
// src.pitch*src.h bytes apart. The plane at src_origin is passed as alpha,
// the next one as luma, and the planes 2 and 3 sizes further on as U and V.
733 case MSP_YV12:
734 case MSP_IYUV:
736 //dst.pitch = abs(dst.pitch);
737 int h2 = h/2;
// Derive the chroma pitch and plane pointers when the caller left them
// empty; for YV12 the two chroma pointers are swapped.
738 if(!dst.pitchUV)
740 dst.pitchUV = abs(dst.pitch)/2;
742 if(!dst.bitsU || !dst.bitsV)
744 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
745 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
746 if(dst.type == MSP_YV12)
748 BYTE* p = dst.bitsU;
749 dst.bitsU = dst.bitsV;
750 dst.bitsV = p;
753 BYTE* dd[2];
754 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
755 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
756 if(rd.top > rd.bottom)
758 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
759 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
760 dst.pitchUV = -dst.pitchUV;
763 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
765 BYTE* ss[2];
766 ss[0] = src_origin + src.pitch*src.h*2;//U
767 ss[1] = src_origin + src.pitch*src.h*3;//V
769 AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
771 AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
772 AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
773 _mm_empty();
775 break;
776 default:
777 return E_NOTIMPL;
778 break;
781 //emms takes about 40 CPU cycles
782 //__asm emms;
783 return S_OK;
786 HRESULT CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
788 const SubPicDesc& src = m_spd;
789 SubPicDesc dst = *pTarget; // copy, because we might modify it
791 CRect rs(*pSrc), rd(*pDst);
793 if(dst.h < 0) {
794 dst.h = -dst.h;
795 rd.bottom = dst.h - rd.bottom;
796 rd.top = dst.h - rd.top;
799 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
800 return E_INVALIDARG;
803 int w = rs.Width(), h = rs.Height();
806 BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
807 BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
809 if(rd.top > rd.bottom) {
810 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
812 dst.pitch = -dst.pitch;
815 for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
817 BYTE* s2 = s;
818 BYTE* s2end = s2 + w*4;
819 WORD* d2 = reinterpret_cast<WORD*>(d);
820 for(; s2 < s2end; s2 += 4, d2++)
822 if(s2[3] < 0xff) {
823 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
828 //UV
829 int h2 = h/2;
830 if(!dst.pitchUV)
832 dst.pitchUV = abs(dst.pitch);
834 if(!dst.bitsU || !dst.bitsV)
836 dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
837 dst.bitsV = dst.bitsU + 2;
839 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
840 if(rd.top > rd.bottom)
842 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
843 dst.pitchUV = -dst.pitchUV;
846 s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
848 d = ddUV;
849 int pitch = src.pitch;
850 for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
852 BYTE* s2 = s;
853 WORD* d2=reinterpret_cast<WORD*>(d);
854 WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
855 DWORD last_alpha = s2[3]+s2[3+src.pitch];
856 for( ; d2<d2_end; s2+=8, d2+=2)
858 unsigned int ia = (
859 last_alpha +
860 (s2[3] + s2[3+src.pitch])*2 +
861 s2[3+4]+ s2[3+4+src.pitch]);
862 last_alpha = s2[3+4]+ s2[3+4+src.pitch];
863 if( ia!=0xFF*8 )
865 d2[0] = (((d2[0])*ia)>>11) + ((s2[0] + s2[0+src.pitch])<<7);
866 d2[1] = (((d2[1])*ia)>>11) + ((s2[4] + s2[4+src.pitch])<<7);
871 return S_OK;
874 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
876 const SubPicDesc& src = m_spd;
877 SubPicDesc dst = *pTarget; // copy, because we might modify it
879 CRect rs(*pSrc), rd(*pDst);
881 if(dst.h < 0) {
882 dst.h = -dst.h;
883 rd.bottom = dst.h - rd.bottom;
884 rd.top = dst.h - rd.top;
887 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
888 return E_INVALIDARG;
891 int w = rs.Width(), h = rs.Height();
893 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
894 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
896 if(rd.top > rd.bottom) {
897 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
899 dst.pitch = -dst.pitch;
902 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
903 BYTE* s2 = s;
904 BYTE* s2end = s2 + w*4;
905 BYTE* d2 = d;
906 for(; s2 < s2end; s2 += 4, d2++) {
907 if(s2[3] < 0xff) {
908 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
912 dst.pitch = abs(dst.pitch);
914 int h2 = h/2;
916 if(!dst.pitchUV) {
917 dst.pitchUV = dst.pitch/2;
920 BYTE* ss[2];
921 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
922 ss[1] = ss[0] + 4;
924 if(!dst.bitsU || !dst.bitsV) {
925 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
926 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
928 if(dst.type == MSP_YV12) {
929 BYTE* p = dst.bitsU;
930 dst.bitsU = dst.bitsV;
931 dst.bitsV = p;
935 BYTE* dd[2];
936 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
937 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
939 if(rd.top > rd.bottom) {
940 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
941 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
942 dst.pitchUV = -dst.pitchUV;
945 for(ptrdiff_t i = 0; i < 2; i++) {
946 s = ss[i];
947 d = dd[i];
948 BYTE* a = ss[0]+3;
949 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
950 BYTE* s2 = s;
951 BYTE* s2end = s2 + w*4;
952 BYTE* d2 = d;
953 BYTE* a2 = a;
955 DWORD last_alpha = a2[0]+a2[0+src.pitch];
956 for(; s2 < s2end; s2 += 8, d2++, a2 += 8) {
957 unsigned int ia = (last_alpha + 2*(a2[0]+a2[0+src.pitch]) + a2[4] + a2[4+src.pitch] + 4 )>>3;
958 last_alpha = a2[4] + a2[4+src.pitch];
959 if(ia < 0xff) {
960 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
966 return S_OK;
969 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
971 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12") );
972 const SubPicDesc& src = m_spd;
973 SubPicDesc dst = *pTarget; // copy, because we might modify it
975 CRect rs(*pSrc), rd(*pDst);
977 if(dst.h < 0) {
978 dst.h = -dst.h;
979 rd.bottom = dst.h - rd.bottom;
980 rd.top = dst.h - rd.top;
983 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
984 return E_INVALIDARG;
987 int w = rs.Width(), h = rs.Height();
989 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
990 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
992 if(rd.top > rd.bottom) {
993 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
995 dst.pitch = -dst.pitch;
998 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
999 BYTE* s2 = s;
1000 BYTE* s2end = s2 + w*4;
1001 BYTE* d2 = d;
1002 for(; s2 < s2end; s2 += 4, d2++) {
1003 if(s2[3] < 0xff) {
1004 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1008 dst.pitch = abs(dst.pitch);
1010 int h2 = h/2;
1012 if(!dst.pitchUV) {
1013 dst.pitchUV = dst.pitch;
1016 BYTE* ss[2];
1017 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1018 ss[1] = ss[0] + 4;
1020 if(!dst.bitsU || !dst.bitsV) {
1021 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1022 dst.bitsV = dst.bitsU + 1;
1024 if(dst.type == MSP_NV21) {
1025 BYTE* p = dst.bitsU;
1026 dst.bitsU = dst.bitsV;
1027 dst.bitsV = p;
1031 BYTE* dd[2];
1032 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1033 dd[1] = dd[0]+1;
1035 if(rd.top > rd.bottom) {
1036 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1037 dd[1] = dd[0]+1;
1038 dst.pitchUV = -dst.pitchUV;
1041 for(ptrdiff_t i = 0; i < 2; i++) {
1042 s = ss[i];
1043 d = dd[i];
1044 BYTE* a = ss[0]+3;
1045 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
1046 BYTE* s2 = s;
1047 BYTE* s2end = s2 + w*4;
1048 BYTE* d2 = d;
1049 BYTE* a2 = a;
1050 DWORD last_alpha = a2[0]+a2[0+src.pitch];
1051 for(; s2 < s2end; s2 += 8, d2+=2, a2 += 8) {
1052 unsigned int ia = (last_alpha+2*(a2[0]+a2[0+src.pitch])+a2[4]+a2[4+src.pitch]+4)>>3;
1053 last_alpha = a2[4]+a2[4+src.pitch];
1054 if(ia < 0xff) {
1055 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1061 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12_2") );
1062 return S_OK;
1065 HRESULT CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1067 //fix me: check colorspace and log error
1068 const SubPicDesc& src = m_spd;
1069 SubPicDesc dst = *pTarget; // copy, because we might modify it
1071 CRect rs(*pSrc), rd(*pDst);
1072 if(dst.h < 0)
1074 dst.h = -dst.h;
1075 rd.bottom = dst.h - rd.bottom;
1076 rd.top = dst.h - rd.top;
1078 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1079 return E_INVALIDARG;
1081 int w = rs.Width(), h = rs.Height();
1082 bool bottom_down = rd.top > rd.bottom;
1084 BYTE* d = NULL;
1085 BYTE* dUV = NULL;
1086 if(!bottom_down)
1088 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1089 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left*2;
1091 else
1093 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1094 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left*2;
1095 dst.pitch = -dst.pitch;
1097 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1099 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1100 const BYTE* sy = sa + src.pitch*src.h;
1101 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1102 return AlphaBltAnv12_P010(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1105 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1107 //fix me: check colorspace and log error
1108 const SubPicDesc& src = m_spd;
1109 SubPicDesc dst = *pTarget; // copy, because we might modify it
1111 CRect rs(*pSrc), rd(*pDst);
1112 if(dst.h < 0)
1114 dst.h = -dst.h;
1115 rd.bottom = dst.h - rd.bottom;
1116 rd.top = dst.h - rd.top;
1118 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1119 return E_INVALIDARG;
1121 int w = rs.Width(), h = rs.Height();
1122 bool bottom_down = rd.top > rd.bottom;
1124 BYTE* d = NULL;
1125 BYTE* dUV = NULL;
1126 if (!bottom_down)
1128 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1129 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left;
1131 else
1133 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1134 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left;
1135 dst.pitch = -dst.pitch;
1137 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1139 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1140 const BYTE* sy = sa + src.pitch*src.h;
1141 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1143 return AlphaBltAnv12_Nv12(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1146 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1148 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1149 if(dirtyRectList!=NULL)
1151 POSITION pos = dirtyRectList->GetHeadPosition();
1152 if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1153 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1154 || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1156 while(pos!=NULL)
1158 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1159 cRectSrc.left &= ~15;
1160 cRectSrc.right = (cRectSrc.right+15)&~15;
1161 if(cRectSrc.right>m_spd.w)
1163 cRectSrc.right = m_spd.w;
1165 cRectSrc.top &= ~1;
1166 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1169 else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1171 while(pos!=NULL)
1173 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1174 cRectSrc.left &= ~3;
1175 cRectSrc.right = (cRectSrc.right+3)&~3;
1179 return __super::SetDirtyRectEx(dirtyRectList);
1183 // static
// Blends a w x h block of 8-bit luma onto dst: for every byte whose alpha
// is below 0xff, dst = ((dst*alpha)>>8) + sub.
//   dst, dst_pitch   - destination plane position and row stride
//   sub, alpha       - source luma and alpha positions; both advance by
//                      sub_pitch per row
// The SSE2 path is taken only when alpha, sub and dst share the same low
// four address bits, both pitches are multiples of 16, w > 32 and the CPU
// reports SSE2; every other case goes to AlphaBltYv12LumaC.
1186 void CMemSubPic::AlphaBltYv12Luma(byte* dst, int dst_pitch,
1187 int w, int h,
1188 const byte* sub, const byte* alpha, int sub_pitch)
1190 if( (
1191 ((reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(sub))
1192 |(reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(dst))
1193 | static_cast<intptr_t>(sub_pitch)
1194 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0
1195 && w > 32 && (g_cpuid.m_flags & CCpuID::sse2))
// head: bytes before the first 16-byte boundary; tail: bytes left over
// after the last full 16-byte block; w1: bytes covered by full blocks.
1197 int head = (16 - (reinterpret_cast<intptr_t>(alpha)&15))&15;
1198 int tail = (w-head) & 15;
1199 int w1 = w - head - tail;
1200 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1202 const BYTE* sa = alpha;
1203 const BYTE* s2 = sub;
1204 const BYTE* s2end_mod16 = s2 + w1;
1205 const BYTE* s2end = s2 + w;
1206 BYTE* d2 = dst;
// Scalar blend until the source pointer reaches a 16-byte boundary.
1208 for( ; (reinterpret_cast<intptr_t>(s2)&15) != 0; s2++, sa++, d2++)
1210 if(sa[0] < 0xff)
1212 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
// 16 bytes per step through the SSE2 kernel.
1215 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
1217 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
// Scalar blend for whatever remains of the row.
1219 for(; s2 < s2end; s2++, sa++, d2++)
1221 if(sa[0] < 0xff)
1223 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1228 else //fix me: only a workaround for non-mod-16 size video
1230 CMemSubPic::AlphaBltYv12LumaC(dst, dst_pitch, w, h, sub, alpha, sub_pitch);
1234 void CMemSubPic::AlphaBltYv12LumaC( byte* dst, int dst_pitch, int w, int h, const byte* sub, const byte* alpha, int sub_pitch )
1236 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1238 const BYTE* sa = alpha;
1239 const BYTE* s2 = sub;
1240 const BYTE* s2end = s2 + w;
1241 BYTE* d2 = dst;
1242 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1244 if(sa[0] < 0xff)
1246 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
1247 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1253 void CMemSubPic::AlphaBltYv12Chroma(byte* dst_uv, int dst_pitch,
1254 int w, int chroma_h,
1255 const byte* src_uv, const byte* src_a, int src_pitch)
1257 if( (
1258 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1259 |(reinterpret_cast<intptr_t>(src_a) ^ (2*reinterpret_cast<intptr_t>(dst_uv)))
1260 | static_cast<intptr_t>(src_pitch)
1261 | (2*static_cast<intptr_t>(dst_pitch)) ) & 15) ==0 &&
1262 w > 16 && (g_cpuid.m_flags & CCpuID::sse2))
1264 int head = (16 - (reinterpret_cast<intptr_t>(src_a)&15))&15;
1265 int tail = (w-head) & 15;
1266 int w00 = w - head - tail;
1268 int pitch = src_pitch;
1269 for(int j = 0; j < chroma_h; j++, src_uv += src_pitch*2, src_a += src_pitch*2, dst_uv += dst_pitch)
1271 hleft_vmid_mix_uv_yv12_c2(dst_uv, head, src_uv, src_a, src_pitch);
1272 hleft_vmid_mix_uv_yv12_sse2(dst_uv+(head>>1), w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1273 hleft_vmid_mix_uv_yv12_c2(dst_uv+((head+w00)>>1), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1276 else//fix me: only a workaround for non-mod-16 size video
1278 AlphaBltYv12ChromaC(dst_uv, dst_pitch, w, chroma_h, src_uv, src_a, src_pitch);
1282 void CMemSubPic::AlphaBltYv12ChromaC( byte* dst, int dst_pitch, int w, int chroma_h, const byte* sub_chroma, const byte* alpha, int sub_pitch )
1284 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
1286 hleft_vmid_mix_uv_yv12_c(dst, w, sub_chroma, alpha, sub_pitch);
1290 HRESULT CMemSubPic::AlphaBltAnv12_P010( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1291 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1293 if ( g_cpuid.m_flags & CCpuID::sse2 )
1295 const BYTE* sa = src_a;
1296 if( (
1297 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_y))
1298 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_y))
1299 | static_cast<intptr_t>(src_pitch)
1300 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 &&
1301 w > 32 )
1303 int head = (16 - reinterpret_cast<intptr_t>(src_a)&15)&15;
1304 int tail = (w - head) & 15;
1306 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1308 const BYTE* sa2 = sa;
1309 const BYTE* s2 = src_y;
1310 const BYTE* s2end_mod16 = s2 + (w&~15);
1311 BYTE* d2 = dst_y;
1312 WORD* d_w=reinterpret_cast<WORD*>(dst_y);
1314 switch( head )//important: it is safe since w > 16
1316 case 15:
1317 #define _XY_MIX_ONE if(sa2[0] < 0xff) { d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8); } sa2++;d_w++;s2++;
1318 _XY_MIX_ONE
1319 case 14:
1320 _XY_MIX_ONE
1321 case 13:
1322 _XY_MIX_ONE
1323 case 12:
1324 _XY_MIX_ONE
1325 case 11:
1326 _XY_MIX_ONE
1327 case 10:
1328 _XY_MIX_ONE
1329 case 9:
1330 _XY_MIX_ONE
1331 case 8:
1332 _XY_MIX_ONE
1333 case 7:
1334 _XY_MIX_ONE
1335 case 6:
1336 _XY_MIX_ONE
1337 case 5:
1338 _XY_MIX_ONE
1339 case 4:
1340 _XY_MIX_ONE
1341 case 3:
1342 _XY_MIX_ONE
1343 case 2:
1344 _XY_MIX_ONE
1345 case 1://fall through on purpose
1346 _XY_MIX_ONE
1348 for(; s2 < s2end_mod16; s2+=16, sa2+=16, d_w+=16)
1350 mix_16_y_p010_sse2( reinterpret_cast<BYTE*>(d_w), s2, sa2);
1352 switch( tail )//important: it is safe since w > 16
1354 case 15:
1355 _XY_MIX_ONE
1356 case 14:
1357 _XY_MIX_ONE
1358 case 13:
1359 _XY_MIX_ONE
1360 case 12:
1361 _XY_MIX_ONE
1362 case 11:
1363 _XY_MIX_ONE
1364 case 10:
1365 _XY_MIX_ONE
1366 case 9:
1367 _XY_MIX_ONE
1368 case 8:
1369 _XY_MIX_ONE
1370 case 7:
1371 _XY_MIX_ONE
1372 case 6:
1373 _XY_MIX_ONE
1374 case 5:
1375 _XY_MIX_ONE
1376 case 4:
1377 _XY_MIX_ONE
1378 case 3:
1379 _XY_MIX_ONE
1380 case 2:
1381 _XY_MIX_ONE
1382 case 1://fall through on purpose
1383 _XY_MIX_ONE
1387 else //fix me: only a workaround for non-mod-16 size video
1389 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1391 const BYTE* sa2 = sa;
1392 const BYTE* s2 = src_y;
1393 const BYTE* s2end = s2 + w;
1394 WORD* d_w = reinterpret_cast<WORD*>(dst_y);
1395 for(; s2 < s2end; s2+=1, sa2+=1, d_w+=1)
1397 if(sa2[0] < 0xff)
1399 d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8);
1404 //UV
1405 int h2 = h/2;
1406 BYTE* d = dst_uv;
1407 if( (
1408 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1409 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1410 | static_cast<intptr_t>(src_pitch)
1411 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1412 w > 16 )
1414 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1415 int tail = (w-head) & 15;
1416 int w00 = w - head - tail;
1418 ASSERT(w>0);//the calls to mix may failed if w==0
1419 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1421 hleft_vmid_mix_uv_p010_c2(d, head, src_uv, src_a, src_pitch);
1422 hleft_vmid_mix_uv_p010_sse2(d+2*head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1423 hleft_vmid_mix_uv_p010_c2(d+2*(head+w00), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1426 else
1428 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1430 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1433 _mm_empty();
1434 return S_OK;
1436 else
1438 return AlphaBltAnv12_P010_C(src_a, src_y, src_uv, src_pitch, dst_y, dst_uv, dst_pitch, w, h);
1442 HRESULT CMemSubPic::AlphaBltAnv12_P010_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1444 const BYTE* sa = src_a;
1445 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1447 const BYTE* sa2 = sa;
1448 const BYTE* s2 = src_y;
1449 const BYTE* s2end = s2 + w;
1450 WORD* d2 = reinterpret_cast<WORD*>(dst_y);
1451 for(; s2 < s2end; s2+=1, sa2+=1, d2+=1)
1453 if(sa2[0] < 0xff)
1455 d2[0] = ((d2[0]*sa2[0])>>8) + (s2[0]<<8);
1459 //UV
1460 int h2 = h/2;
1461 BYTE* d = dst_uv;
1462 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1464 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1466 return S_OK;
1469 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1470 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1472 AlphaBltYv12Luma( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1474 int h2 = h/2;
1475 if( (
1476 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1477 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1478 | static_cast<intptr_t>(src_pitch)
1479 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1480 w > 16 && (g_cpuid.m_flags & CCpuID::sse2) )
1482 BYTE* d = dst_uv;
1484 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1485 int tail = (w-head) & 15;
1486 int w00 = w - head - tail;
1488 ASSERT(w>0);//the calls to mix may failed if w==0
1489 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1491 hleft_vmid_mix_uv_nv12_c2(d, head, src_uv, src_a, src_pitch);
1492 hleft_vmid_mix_uv_nv12_sse2(d+head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1493 hleft_vmid_mix_uv_nv12_c2(d+head+w00, tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1495 _mm_empty();
1497 else
1499 BYTE* d = dst_uv;
1500 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1502 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1505 return S_OK;
1508 HRESULT CMemSubPic::AlphaBltAnv12_Nv12_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1510 AlphaBltYv12LumaC( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1511 int h2 = h/2;
1512 BYTE* d = dst_uv;
1513 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1515 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1517 return S_OK;
1520 void CMemSubPic::SubsampleAndInterlace( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1522 for (int i=0;i<h;i+=2)
1524 hleft_vmid_subsample_and_interlace_2_line_sse2(dst, u, v, w, pitch);
1525 u += 2*pitch;
1526 v += 2*pitch;
1527 dst += pitch;
1531 void CMemSubPic::SubsampleAndInterlaceC( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1533 for (int i=0;i<h;i+=2)
1535 hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w, pitch);
1536 u += 2*pitch;
1537 v += 2*pitch;
1538 dst += pitch;
1542 void CMemSubPic::AlphaBlt_YUY2(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
1544 #ifdef _WIN64
1545 AlphaBlt_YUY2_C(w, h, d, dstpitch, s, srcpitch);
1546 #else
1547 AlphaBlt_YUY2_MMX(w, h, d, dstpitch, s, srcpitch);
1548 #endif
1552 // CMemSubPicAllocator
1555 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1556 : CSubPicExAllocatorImpl(maxsize, false, false)
1557 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1558 , m_maxsize(maxsize)
1559 , m_type(type)
1561 if(m_type==-1)
1563 switch(alpha_blt_dst_type)
1565 case MSP_YUY2:
1566 m_type = MSP_XY_AUYV;
1567 break;
1568 case MSP_AYUV:
1569 m_type = MSP_AYUV;
1570 break;
1571 case MSP_IYUV:
1572 case MSP_YV12:
1573 case MSP_P010:
1574 case MSP_P016:
1575 case MSP_NV12:
1576 case MSP_NV21:
1577 m_type = MSP_AYUV_PLANAR;
1578 break;
1579 default:
1580 m_type = MSP_RGBA;
1581 break;
1586 // ISubPicAllocatorImpl
1588 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1590 if(!ppSubPic) {
1591 return false;
1593 SubPicDesc spd;
1594 spd.w = m_maxsize.cx;
1595 spd.h = m_maxsize.cy;
1596 spd.bpp = 32;
1597 spd.pitch = (spd.w*spd.bpp)>>3;
1598 spd.type = m_type;
1599 spd.bits = DNew BYTE[spd.pitch*spd.h];
1600 if(!spd.bits) {
1601 return false;
1603 *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1604 if(!(*ppSubPic)) {
1605 return false;
1607 (*ppSubPic)->AddRef();
1608 return true;