Increase ParseScript cache from 30 to 90 seconds
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blob8635aaed157009d468aafa8fab8865f79980d030
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
26 #if 0
27 #include <fstream>
29 // debug functions
30 //
31 static void SaveRect2File(const CRect& cRect, const char * filename)
33 std::ofstream os(filename);
34 os<<cRect.left<<","<<cRect.top<<","<<cRect.right<<","<<cRect.bottom;
36 static void SaveAxxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
38 std::ofstream axxx(filename);
39 int w = cRect.Width(), h = cRect.Height();
41 BYTE* top = (BYTE*)spd.bits + spd.pitch*cRect.top + cRect.left*4;
42 BYTE* bottom = top + spd.pitch*h;
44 for(; top < bottom ; top += spd.pitch) {
45 BYTE* s = top;
46 BYTE* e = s + w*4;
47 for(; s < e; s+=4) { // ARGB ARGB -> AxYU AxYV
48 axxx<<(int)s[0]<<","<<(int)s[1]<<","<<(int)s[2]<<","<<(int)s[3];
49 if(s+4>=e)
51 axxx<<std::endl;
53 else
55 axxx<<",";
59 axxx.close();
61 static void SaveArgb2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
63 SaveAxxx2File(spd, cRect, filename);
65 static void SaveAyuv2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
67 SaveAxxx2File(spd, cRect, filename);
69 static void SaveNvxx2File(SubPicDesc& spd, const CRect& cRect, const char * filename)
71 std::ofstream os(filename);
72 int w = cRect.Width(), h = cRect.Height();
74 BYTE* top = (BYTE*)spd.bits;
75 BYTE* bottom = top + spd.pitch*h;
77 for(; top < bottom ; top += spd.pitch) {
78 BYTE* s = top;
79 BYTE* e = s + w;
81 BYTE* sY = s + spd.pitch*spd.h;
82 BYTE* sU = sY + spd.pitch*spd.h;
83 BYTE* sV = sU + 1;
84 for(; s < e; s++, sY++, sU+=2,sV+=2) {
85 os<<(int)s[0]<<","<<(int)sY[0]<<","<<(int)sU[0]<<","<<(int)sV[0];
86 if(s+1>=e)
88 os<<std::endl;
90 else
92 os<<",";
96 os.close();
// ONCER(expr): debug-only helper. Each expansion carries its own static flag,
// so expr is evaluated the first time control reaches that expansion and is
// skipped on every later pass.
99 #define ONCER(expr) {\
100 static bool entered=false;\
101 if(!entered)\
103 entered=true;\
104 expr;\
107 #else
108 #define ONCER(expr)
109 #endif
112 // alpha blend functions
114 #include "xy_intrinsics.h"
115 #include "../dsutil/vd.h"
117 #ifndef _WIN64
// Blends a subtitle buffer onto a YUY2 destination using 32-bit MMX inline asm.
// Parameters: w/h are the size of the area in source pixels, d/dstpitch the
// destination rows, s/srcpitch the source rows (4 bytes per source pixel).
// Each inner iteration consumes two source pixels (8 bytes) and updates one
// destination DWORD. Source bytes 3 and 7 are used as the multipliers applied
// to the destination; bytes 1/5 and 0/4 are the values added on top (the
// existing comment on c names them y1/y2 and u/v).
118 static void AlphaBlt_YUY2_MMX(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
120 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
122 unsigned int ia, c;
123 PCUINT8 s2 = s;
124 PCUINT8 s2end = s2 + w*4;
125 DWORD* d2 = (DWORD*)d;
126 ASSERT(w>0);
// Seed the running value with the first pixel's byte 3 so the first filter tap is defined.
127 int last_a = w>0?s2[3]:0;
128 for(; s2 < s2end; s2 += 8, d2++)
// 1-2-1 weighted average of: previous pair's second multiplier, this pair's first, this pair's second.
130 ia = (last_a + 2*s2[3] + s2[7])>>2;
131 last_a = s2[7];
// Only blend when the averaged multiplier is below 0xff; otherwise the destination is left untouched.
132 if(ia < 0xff)
134 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
135 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
136 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
137 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
138 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
// Pack the four per-byte multipliers (ia reused as the packed value) and the four addends
// into DWORDs laid out like the destination DWORD.
140 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
141 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
// The asm widens bytes to words, halves the multipliers, multiplies with the destination,
// shifts right by 7, adds the addends with signed saturation and packs back to bytes.
142 __asm
144 mov edi, d2
145 pxor mm0, mm0
146 movd mm2, c
147 punpcklbw mm2, mm0
148 movd mm3, [edi]
149 punpcklbw mm3, mm0
150 movd mm4, ia
151 punpcklbw mm4, mm0
152 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
153 pmullw mm3, mm4
154 psraw mm3, 7
155 paddsw mm3, mm2
156 packuswb mm3, mm3
157 movd [edi], mm3
// Leave MMX state so later x87 floating-point code works.
162 _mm_empty();
164 #endif
166 void AlphaBlt_YUY2_C(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
168 for(int j = 0; j < h; j++, s += srcpitch, d += dstpitch)
170 DWORD ia;
171 PCUINT8 s2 = s;
172 PCUINT8 s2end = s2 + w*4;
173 DWORD* d2 = (DWORD*)d;
174 ASSERT(w>0);
175 int last_a = w>0?s2[3]:0;
176 for(; s2 < s2end; s2 += 8, d2++)
178 ia = (last_a + 2*s2[3] + s2[7])>>2;
179 last_a = s2[7];
180 if(ia < 0xff)
182 DWORD y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
183 DWORD u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
184 DWORD y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
185 DWORD v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
186 *d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
194 // CMemSubPic
197 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
198 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
200 m_maxsize.SetSize(spd.w, spd.h);
201 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
202 CRect allSpd(0,0,spd.w, spd.h);
203 m_rectListDirty.AddTail(allSpd);
206 CMemSubPic::~CMemSubPic()
208 delete [] m_spd.bits, m_spd.bits = NULL;
211 // ISubPic
213 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
215 return (void*)&m_spd;
218 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
220 spd.type = m_spd.type;
221 spd.w = m_size.cx;
222 spd.h = m_size.cy;
223 spd.bpp = m_spd.bpp;
224 spd.pitch = m_spd.pitch;
225 spd.bits = m_spd.bits;
226 spd.bitsU = m_spd.bitsU;
227 spd.bitsV = m_spd.bitsV;
228 spd.vidrect = m_vidrect;
229 return S_OK;
232 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
234 HRESULT hr;
235 if(FAILED(hr = __super::CopyTo(pSubPic))) {
236 return hr;
239 SubPicDesc src, dst;
240 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
241 return E_FAIL;
243 while(!m_rectListDirty.IsEmpty())
245 CRect& cRect = m_rectListDirty.GetHead();
246 int w = cRect.Width(), h = cRect.Height();
247 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
248 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
249 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
250 memcpy(d, s, w*4);
252 return S_OK;
255 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
257 if(m_rectListDirty.IsEmpty()) {
258 return S_OK;
260 while(!m_rectListDirty.IsEmpty())
262 //pDirtyRect = m_rectListDirty.RemoveHead();
263 CRect& dirtyRect = m_rectListDirty.RemoveTail();
264 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
265 int w = dirtyRect.Width();
266 if(m_spd.type!=MSP_AYUV_PLANAR)
268 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
270 #ifdef _WIN64
271 memsetd(p, color, w*4); // nya
272 #else
273 __asm
275 mov eax, color
276 mov ecx, w
277 mov edi, p
279 rep stosd
282 #endif
285 else
287 ///TODO:
288 ///FIX ME
289 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
291 // memsetd(p, 0, m_rcDirty.Width());
292 //DbgLog((LOG_TRACE, 3, "w:%d", w));
293 //w = pDirtyRect->Width();
294 memset(p, 0xFF, w);
295 memset(p+m_spd.h*m_spd.pitch, 0, w);
296 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
297 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
301 m_rectListDirty.RemoveAll();
302 return S_OK;
305 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
307 return GetDesc(spd);
310 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
312 int src_type = m_spd.type;
313 int dst_type = m_alpha_blt_dst_type;
314 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
315 dst_type == MSP_RGB24 ||
316 dst_type == MSP_RGB16 ||
317 dst_type == MSP_RGB15))
319 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
321 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
323 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
324 dst_type == MSP_YV12 ||
325 dst_type == MSP_P010 ||
326 dst_type == MSP_P016 ||
327 dst_type == MSP_NV12 ||
328 dst_type == MSP_NV21)))
330 return UnlockOther(dirtyRectList);
332 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
333 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
334 dst_type == MSP_IYUV ||
335 dst_type == MSP_YV12 ||
336 dst_type == MSP_NV12 ||
337 dst_type == MSP_NV21 ||
338 dst_type == MSP_P010 ||
339 dst_type == MSP_P016))
341 return UnlockRGBA_YUV(dirtyRectList);
343 return E_NOTIMPL;
// Unlock path for sources that are already in the target's colour family.
// Records the dirty rectangles via SetDirtyRectEx, then post-processes the
// pixels of every non-empty dirty rectangle in place, depending on
// m_alpha_blt_dst_type. Always returns S_OK.
346 HRESULT CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
348 SetDirtyRectEx(dirtyRectList);
349 if(m_rectListDirty.IsEmpty()) {
350 return S_OK;
353 POSITION pos = m_rectListDirty.GetHeadPosition();
354 while(pos!=NULL)
356 const CRect& cRect = m_rectListDirty.GetNext(pos);
357 int w = cRect.Width(), h = cRect.Height();
358 if (w<=0 || h<=0)
360 continue;
// top/bottom bracket the rows of this rectangle; pixels are addressed with 4 bytes each.
363 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
364 BYTE* bottom = top + m_spd.pitch*h;
365 if(m_alpha_blt_dst_type == MSP_RGB16)
367 for(; top < bottom ; top += m_spd.pitch)
369 DWORD* s = (DWORD*)top;
370 DWORD* e = s + w;
371 for(; s < e; s++)
// Repack each 32-bit pixel: top byte reduced to 5 bits (>>3), colour packed as 5-6-5 in the low 16 bits.
373 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
374 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
378 else if(m_alpha_blt_dst_type == MSP_RGB15)
380 for(; top < bottom; top += m_spd.pitch)
382 DWORD* s = (DWORD*)top;
383 DWORD* e = s + w;
384 for(; s < e; s++)
// Same as above but colour is packed as 5-5-5.
386 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
387 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
391 else if(m_alpha_blt_dst_type == MSP_YUY2)
// NOTE(review): XY_DO_ONCE / xy_logger are defined elsewhere; this looks like a one-shot debug dump
// to a hard-coded path - confirm it compiles to nothing in release builds.
393 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
395 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
397 BYTE* s = tempTop;
398 BYTE* e = s + w*4;
// Seed both running values from the first pixel of the row.
399 BYTE last_v = s[0], last_u=s[2];
400 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
// Two pixels per step. Byte 0 values are 1-2-1 filtered (previous pair's second pixel, this pair's
// first and second) into s[4]; byte 2 values are filtered the same way into s[0].
// tmp keeps the unfiltered s[4] for the next step.
402 BYTE tmp = s[4];
403 s[4] = (last_v + 2*s[0] + s[4] + 2)>>2;
404 last_v = tmp;
406 s[0] = (last_u + 2*s[2] + s[6] + 2)>>2;
407 last_u = s[6];
411 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
413 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV
414 || m_alpha_blt_dst_type == MSP_AYUV)
416 //nothing to do
418 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
419 || m_alpha_blt_dst_type == MSP_NV12 )
// Semi-planar targets: chroma is subsampled and interleaved; the flag selects which plane comes first.
421 SubsampleAndInterlace(cRect, true);
423 else if( m_alpha_blt_dst_type == MSP_NV21 )
425 SubsampleAndInterlace(cRect, false);
428 return S_OK;
// Unlock path for an RGBA source that will be blended onto a YUV target.
// Records the dirty rectangles via SetDirtyRectEx, then converts the pixels of
// every non-empty dirty rectangle in place using ColorConvTable::PreMulArgb2Ayuv.
// Always returns S_OK. The ONCER(...) lines are debug dumps that expand to
// nothing unless the debug section at the top of the file is enabled.
431 HRESULT CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
433 //debug
434 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect") );
435 ONCER( SaveArgb2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.argb") );
437 SetDirtyRectEx(dirtyRectList);
439 ONCER( SaveRect2File(dirtyRectList->GetHead(), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.rect2") );
440 if(m_rectListDirty.IsEmpty()) {
441 return S_OK;
444 POSITION pos = m_rectListDirty.GetHeadPosition();
445 while(pos!=NULL)
447 const CRect& cRect = m_rectListDirty.GetNext(pos);
448 int w = cRect.Width(), h = cRect.Height();
449 if(w<=0 || h<=0)
451 continue;
454 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
455 BYTE* bottom = top + m_spd.pitch*h;
// Subsampled-chroma targets: pixels are handled in horizontal pairs.
457 if( m_alpha_blt_dst_type == MSP_YUY2 ||
458 m_alpha_blt_dst_type == MSP_YV12 ||
459 m_alpha_blt_dst_type == MSP_IYUV ||
460 m_alpha_blt_dst_type == MSP_P010 ||
461 m_alpha_blt_dst_type == MSP_P016 ||
462 m_alpha_blt_dst_type == MSP_NV12 ||
463 m_alpha_blt_dst_type == MSP_NV21) {
464 for(; top < bottom ; top += m_spd.pitch) {
465 BYTE* s = top;
466 BYTE* e = s + w*4;
// Seed the running converted value with the first pixel of the row.
467 DWORD last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
468 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
// Skip the conversion work only when this pair and the previous neighbour all have byte 3 == 0xff.
469 if((s[3]+s[7]+(last_yuv>>24)) < 0xff*3) {
470 DWORD tmp1 = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
471 DWORD tmp2 = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
// Bits 16-23 of each converted value go to byte 1 of its own pixel.
473 s[1] = (tmp1>>16)&0xff;
474 s[5] = (tmp2>>16)&0xff;
// Bits 8-15 and bits 0-7 are 1-2-1 filtered across previous/first/second and stored in s[0] and s[4].
476 s[0] = (((last_yuv>>8)&0xff) + 2*((tmp1>>8)&0xff) + ((tmp2>>8)&0xff) + 2)/4;
477 s[4] = ((last_yuv&0xff) + 2*(tmp1&0xff) + (tmp2&0xff) + 2)/4;
478 last_yuv = tmp2;
479 } else {
480 last_yuv = ColorConvTable::PreMulArgb2Ayuv(s[7], s[6], s[5], s[4]);
482 s[1] = s[5] = 0;
483 s[0] = s[4] = 0;
// Packed AYUV target: each pixel is converted independently.
488 else if(m_alpha_blt_dst_type == MSP_AYUV) {
489 for(; top < bottom ; top += m_spd.pitch) {
490 BYTE* s = top;
491 BYTE* e = s + w*4;
492 for(; s < e; s+=4) { // ARGB -> AYUV
493 if(s[3] < 0xff) {
494 *((DWORD*)s) = ColorConvTable::PreMulArgb2Ayuv(s[3], s[2], s[1], s[0]);
495 } else {
// Byte 3 == 0xff: zero the three colour bytes and keep byte 3 as is.
496 s[0] = s[1] = 0;
497 s[2] = 0;
504 ONCER( SaveAxxx2File(m_spd, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.axuv") );
505 return S_OK;
508 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
510 //fix me: check alignment and log error
511 int w = cRect.Width(), h = cRect.Height();
512 BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
513 BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
514 BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
515 BYTE* dst = u_start;
516 if(!u_first)
518 BYTE* tmp = v_start;
519 v_start = u_start;
520 u_start = tmp;
523 //Todo: fix me.
524 //Walkarround for alignment
525 if ( ((m_spd.pitch|w) &15) == 0 && (g_cpuid.m_flags & CCpuID::sse2) )
527 ASSERT(w%16==0);
528 SubsampleAndInterlace(dst, u_start, v_start, h, w, m_spd.pitch);
530 else
532 SubsampleAndInterlaceC(dst, u_start, v_start, h, w, m_spd.pitch);
536 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
538 if(!pSrc || !pDst || !pTarget) {
539 return E_POINTER;
541 int src_type = m_spd.type;
542 int dst_type = pTarget->type;
544 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
545 dst_type == MSP_RGB24 ||
546 dst_type == MSP_RGB16 ||
547 dst_type == MSP_RGB15 ||
548 dst_type == MSP_RGBA ||
549 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
550 dst_type == MSP_AYUV ))
552 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
554 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
556 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
557 dst_type == MSP_YV12)) )
559 return AlphaBltOther(pSrc, pDst, pTarget);
561 else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
562 dst_type == MSP_NV21 ) )
564 return AlphaBltAnv12_Nv12(pSrc, pDst, pTarget);
567 else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
568 dst_type == MSP_P016 ) )
570 return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
572 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
573 dst_type == MSP_YV12))
575 return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
577 else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
578 dst_type == MSP_NV21))
580 return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
582 else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
583 dst_type == MSP_P016))
585 return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
587 return E_NOTIMPL;
// Generic blend of the source rectangle pSrc onto pDst of pTarget, with one
// code path per target type. Returns E_INVALIDARG when the rectangles differ
// in size, E_NOTIMPL for target types not handled here, otherwise S_OK.
// In every path the destination value is multiplied by the source's byte 3
// (or a value derived from it) and the source colour is then added.
590 HRESULT CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
592 const SubPicDesc& src = m_spd;
593 SubPicDesc dst = *pTarget; // copy, because we might modify it
595 CRect rs(*pSrc), rd(*pDst);
// A negative target height is normalised here and the destination rect is mirrored vertically,
// which leaves rd.top > rd.bottom for the checks further down.
596 if(dst.h < 0)
598 dst.h = -dst.h;
599 rd.bottom = dst.h - rd.bottom;
600 rd.top = dst.h - rd.top;
602 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
603 return E_INVALIDARG;
605 int w = rs.Width(), h = rs.Height();
606 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
607 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Mirrored destination: start one row above rd.top and walk with a negated pitch.
608 if(rd.top > rd.bottom)
610 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
611 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
612 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
614 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
616 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
618 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
620 else
622 return E_NOTIMPL;
624 dst.pitch = -dst.pitch;
626 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
627 switch(dst.type)
// RGBA target: the destination is overwritten, not blended. Each colour channel is divided by
// (0x100 - byte 3) after scaling by 256, and the top byte is written as 0xff minus the source's top byte.
629 case MSP_RGBA:
630 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
632 BYTE* s2 = s;
633 BYTE* s2end = s2 + w*4;
634 DWORD* d2 = (DWORD*)d;
635 for(; s2 < s2end; s2 += 4, d2++)
637 if(s2[3] < 0xff)
639 DWORD bd =0x00000100 -( (DWORD) s2[3]);
640 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
641 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
642 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
643 *d2 = B | V | R
644 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
648 break;
// RGB32 target: bytes 0 and 2 are blended together in one multiply, byte 1 separately;
// the top byte of the result is 0.
649 case MSP_RGB32:
650 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
652 BYTE* s2 = s;
653 BYTE* s2end = s2 + w*4;
654 DWORD* d2 = (DWORD*)d;
655 for(; s2 < s2end; s2 += 4, d2++)
657 if(s2[3] < 0xff)
659 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
660 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
664 break;
// AYUV target: same arithmetic as RGB32, but the destination's top byte is preserved.
665 case MSP_AYUV:
666 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
668 BYTE* s2 = s;
669 BYTE* s2end = s2 + w*4;
670 DWORD* d2 = (DWORD*)d;
671 for(; s2 < s2end; s2 += 4, d2++)
673 if(s2[3] < 0xff)
675 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
676 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00)
677 | (*d2&0xff000000);
681 break;
// RGB24 target: three destination bytes per four source bytes, blended per byte.
682 case MSP_RGB24:
683 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
685 BYTE* s2 = s;
686 BYTE* s2end = s2 + w*4;
687 BYTE* d2 = d;
688 for(; s2 < s2end; s2 += 4, d2 += 3)
690 if(s2[3] < 0xff)
692 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
693 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
694 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
698 break;
// RGB16 target: byte 3 is compared against 0x1f and the product shifted by 5, i.e. it is
// treated as a 5-bit value here. The source colour is read from the low 16 bits of the source pixel.
699 case MSP_RGB16:
700 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
702 BYTE* s2 = s;
703 BYTE* s2end = s2 + w*4;
704 WORD* d2 = (WORD*)d;
705 for(; s2 < s2end; s2 += 4, d2++)
707 if(s2[3] < 0x1f)
709 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
710 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
711 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
712 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
713 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
718 break;
719 case MSP_RGB15:
720 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
722 BYTE* s2 = s;
723 BYTE* s2end = s2 + w*4;
724 WORD* d2 = (WORD*)d;
725 for(; s2 < s2end; s2 += 4, d2++)
727 if(s2[3] < 0x1f)
729 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
730 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
731 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
732 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
733 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
738 break;
739 case MSP_YUY2:
740 AlphaBlt_YUY2(w, h, d, dst.pitch, s, src.pitch);
741 break;
742 case MSP_YV12:
743 case MSP_IYUV:
745 //dst.pitch = abs(dst.pitch);
746 int h2 = h/2;
// Chroma layout defaults when the caller left them unset: half-pitch chroma planes stored
// after the luma plane, with the two plane pointers swapped for YV12.
747 if(!dst.pitchUV)
749 dst.pitchUV = abs(dst.pitch)/2;
751 if(!dst.bitsU || !dst.bitsV)
753 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
754 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
755 if(dst.type == MSP_YV12)
757 BYTE* p = dst.bitsU;
758 dst.bitsU = dst.bitsV;
759 dst.bitsV = p;
762 BYTE* dd[2];
763 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
764 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
765 if(rd.top > rd.bottom)
767 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
768 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
769 dst.pitchUV = -dst.pitchUV;
// The source is addressed as consecutive planes of src.pitch*src.h bytes, one byte per pixel:
// plane 0 is passed as the alpha argument, plane 1 as luma, planes 2 and 3 as chroma.
772 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
774 BYTE* ss[2];
775 ss[0] = src_origin + src.pitch*src.h*2;//U
776 ss[1] = src_origin + src.pitch*src.h*3;//V
778 AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
780 AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
781 AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
782 #ifndef _WIN64
783 // TODOX64 : fixme!
784 _mm_empty();
785 #endif
787 break;
788 default:
789 return E_NOTIMPL;
790 break;
793 //emms takes about 40 cpu cycles
794 //__asm emms;
795 return S_OK;
798 HRESULT CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
800 const SubPicDesc& src = m_spd;
801 SubPicDesc dst = *pTarget; // copy, because we might modify it
803 CRect rs(*pSrc), rd(*pDst);
805 if(dst.h < 0) {
806 dst.h = -dst.h;
807 rd.bottom = dst.h - rd.bottom;
808 rd.top = dst.h - rd.top;
811 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
812 return E_INVALIDARG;
815 int w = rs.Width(), h = rs.Height();
818 BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
819 BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
821 if(rd.top > rd.bottom) {
822 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
824 dst.pitch = -dst.pitch;
827 for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
829 BYTE* s2 = s;
830 BYTE* s2end = s2 + w*4;
831 WORD* d2 = reinterpret_cast<WORD*>(d);
832 for(; s2 < s2end; s2 += 4, d2++)
834 if(s2[3] < 0xff) {
835 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
840 //UV
841 int h2 = h/2;
842 if(!dst.pitchUV)
844 dst.pitchUV = abs(dst.pitch);
846 if(!dst.bitsU || !dst.bitsV)
848 dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
849 dst.bitsV = dst.bitsU + 2;
851 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
852 if(rd.top > rd.bottom)
854 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
855 dst.pitchUV = -dst.pitchUV;
858 s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
860 d = ddUV;
861 int pitch = src.pitch;
862 for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
864 BYTE* s2 = s;
865 WORD* d2=reinterpret_cast<WORD*>(d);
866 WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
867 DWORD last_alpha = s2[3]+s2[3+src.pitch];
868 for( ; d2<d2_end; s2+=8, d2+=2)
870 unsigned int ia = (
871 last_alpha +
872 (s2[3] + s2[3+src.pitch])*2 +
873 s2[3+4]+ s2[3+4+src.pitch]);
874 last_alpha = s2[3+4]+ s2[3+4+src.pitch];
875 if( ia!=0xFF*8 )
877 d2[0] = (((d2[0])*ia)>>11) + ((s2[0] + s2[0+src.pitch])<<7);
878 d2[1] = (((d2[1])*ia)>>11) + ((s2[4] + s2[4+src.pitch])<<7);
883 return S_OK;
// Blends an RGBA-typed source (4 bytes per pixel, already converted in place
// by Unlock) onto a planar YV12/IYUV target with 8-bit samples and separate,
// 2x2-subsampled chroma planes. Returns E_INVALIDARG when the rectangles
// differ in size, otherwise S_OK.
886 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
888 const SubPicDesc& src = m_spd;
889 SubPicDesc dst = *pTarget; // copy, because we might modify it
891 CRect rs(*pSrc), rd(*pDst);
// A negative height is normalised and the destination rect mirrored (rd.top > rd.bottom afterwards).
893 if(dst.h < 0) {
894 dst.h = -dst.h;
895 rd.bottom = dst.h - rd.bottom;
896 rd.top = dst.h - rd.top;
899 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
900 return E_INVALIDARG;
903 int w = rs.Width(), h = rs.Height();
905 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
906 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
908 if(rd.top > rd.bottom) {
909 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
911 dst.pitch = -dst.pitch;
// Luma pass: destination byte scaled by source byte 3, then source byte 1 added.
914 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
915 BYTE* s2 = s;
916 BYTE* s2end = s2 + w*4;
917 BYTE* d2 = d;
918 for(; s2 < s2end; s2 += 4, d2++) {
919 if(s2[3] < 0xff) {
920 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
// From here on the pitch is used as a positive size for locating the chroma planes.
924 dst.pitch = abs(dst.pitch);
926 int h2 = h/2;
928 if(!dst.pitchUV) {
929 dst.pitchUV = dst.pitch/2;
// ss[0] reads byte 0 of the first pixel of each pair, ss[1] byte 0 of the second pixel.
932 BYTE* ss[2];
933 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
934 ss[1] = ss[0] + 4;
// Default chroma plane locations when the caller left them unset; swapped for YV12.
936 if(!dst.bitsU || !dst.bitsV) {
937 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
938 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
940 if(dst.type == MSP_YV12) {
941 BYTE* p = dst.bitsU;
942 dst.bitsU = dst.bitsV;
943 dst.bitsV = p;
947 BYTE* dd[2];
948 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
949 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
951 if(rd.top > rd.bottom) {
952 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
953 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
954 dst.pitchUV = -dst.pitchUV;
// Chroma pass, once per plane. Each step covers a 2x2 block of source pixels and one destination byte.
957 for(ptrdiff_t i = 0; i < 2; i++) {
958 s = ss[i];
959 d = dd[i];
// The multiplier always comes from byte 3, starting at the first pixel of each pair.
960 BYTE* a = ss[0]+3;
961 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
962 BYTE* s2 = s;
963 BYTE* s2end = s2 + w*4;
964 BYTE* d2 = d;
965 BYTE* a2 = a;
967 DWORD last_alpha = a2[0]+a2[0+src.pitch];
968 for(; s2 < s2end; s2 += 8, d2++, a2 += 8) {
// 1-2-1 horizontal filter over vertically summed byte-3 values, rounded and divided by 8.
969 unsigned int ia = (last_alpha + 2*(a2[0]+a2[0+src.pitch]) + a2[4] + a2[4+src.pitch] + 4 )>>3;
970 last_alpha = a2[4] + a2[4+src.pitch];
971 if(ia < 0xff) {
// Added value is the average of the two vertically adjacent source bytes.
972 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
978 return S_OK;
// Blends an RGBA-typed source (4 bytes per pixel, already converted in place
// by Unlock) onto an NV12/NV21 target: 8-bit luma plane followed by one chroma
// plane with the two chroma bytes interleaved. Returns E_INVALIDARG when the
// rectangles differ in size, otherwise S_OK. The ONCER(...) lines are debug
// dumps that expand to nothing unless the debug section of the file is enabled.
981 HRESULT CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
983 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12") );
984 const SubPicDesc& src = m_spd;
985 SubPicDesc dst = *pTarget; // copy, because we might modify it
987 CRect rs(*pSrc), rd(*pDst);
// A negative height is normalised and the destination rect mirrored (rd.top > rd.bottom afterwards).
989 if(dst.h < 0) {
990 dst.h = -dst.h;
991 rd.bottom = dst.h - rd.bottom;
992 rd.top = dst.h - rd.top;
995 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
996 return E_INVALIDARG;
999 int w = rs.Width(), h = rs.Height();
1001 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1002 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1004 if(rd.top > rd.bottom) {
1005 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1007 dst.pitch = -dst.pitch;
// Luma pass: destination byte scaled by source byte 3, then source byte 1 added.
1010 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1011 BYTE* s2 = s;
1012 BYTE* s2end = s2 + w*4;
1013 BYTE* d2 = d;
1014 for(; s2 < s2end; s2 += 4, d2++) {
1015 if(s2[3] < 0xff) {
1016 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
// From here on the pitch is used as a positive size for locating the chroma plane.
1020 dst.pitch = abs(dst.pitch);
1022 int h2 = h/2;
// The interleaved chroma plane has the same pitch as luma unless the caller says otherwise.
1024 if(!dst.pitchUV) {
1025 dst.pitchUV = dst.pitch;
// ss[0] reads byte 0 of the first pixel of each pair, ss[1] byte 0 of the second pixel.
1028 BYTE* ss[2];
1029 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1030 ss[1] = ss[0] + 4;
// Default chroma pointers when unset: the two channels are one byte apart; swapped for NV21.
1032 if(!dst.bitsU || !dst.bitsV) {
1033 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1034 dst.bitsV = dst.bitsU + 1;
1036 if(dst.type == MSP_NV21) {
1037 BYTE* p = dst.bitsU;
1038 dst.bitsU = dst.bitsV;
1039 dst.bitsV = p;
// NOTE(review): dd[1] is always dd[0]+1 and dst.bitsV is not read below - confirm this is
// intended when the caller supplies its own bitsU/bitsV.
1043 BYTE* dd[2];
1044 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1045 dd[1] = dd[0]+1;
1047 if(rd.top > rd.bottom) {
1048 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1049 dd[1] = dd[0]+1;
1050 dst.pitchUV = -dst.pitchUV;
// Chroma pass, once per channel. Each step covers a 2x2 block of source pixels and writes
// one byte, advancing two bytes in the interleaved destination.
1053 for(ptrdiff_t i = 0; i < 2; i++) {
1054 s = ss[i];
1055 d = dd[i];
1056 BYTE* a = ss[0]+3;
1057 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, a += src.pitch*2) {
1058 BYTE* s2 = s;
1059 BYTE* s2end = s2 + w*4;
1060 BYTE* d2 = d;
1061 BYTE* a2 = a;
1062 DWORD last_alpha = a2[0]+a2[0+src.pitch];
1063 for(; s2 < s2end; s2 += 8, d2+=2, a2 += 8) {
// 1-2-1 horizontal filter over vertically summed byte-3 values, rounded and divided by 8.
1064 unsigned int ia = (last_alpha+2*(a2[0]+a2[0+src.pitch])+a2[4]+a2[4+src.pitch]+4)>>3;
1065 last_alpha = a2[4]+a2[4+src.pitch];
1066 if(ia < 0xff) {
1067 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1073 ONCER( SaveArgb2File(*pTarget, CRect(CPoint(0,0), m_size), "F:/mplayer_MinGW_full/MinGW/home/Administrator/xy_vsfilter/debug.nv12_2") );
1074 return S_OK;
1077 HRESULT CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1079 //fix me: check colorspace and log error
1080 const SubPicDesc& src = m_spd;
1081 SubPicDesc dst = *pTarget; // copy, because we might modify it
1083 CRect rs(*pSrc), rd(*pDst);
1084 if(dst.h < 0)
1086 dst.h = -dst.h;
1087 rd.bottom = dst.h - rd.bottom;
1088 rd.top = dst.h - rd.top;
1090 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1091 return E_INVALIDARG;
1093 int w = rs.Width(), h = rs.Height();
1094 bool bottom_down = rd.top > rd.bottom;
1096 BYTE* d = NULL;
1097 BYTE* dUV = NULL;
1098 if(!bottom_down)
1100 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1101 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left*2;
1103 else
1105 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1106 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left*2;
1107 dst.pitch = -dst.pitch;
1109 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1111 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1112 const BYTE* sy = sa + src.pitch*src.h;
1113 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1114 return AlphaBltAnv12_P010(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1117 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1119 //fix me: check colorspace and log error
1120 const SubPicDesc& src = m_spd;
1121 SubPicDesc dst = *pTarget; // copy, because we might modify it
1123 CRect rs(*pSrc), rd(*pDst);
1124 if(dst.h < 0)
1126 dst.h = -dst.h;
1127 rd.bottom = dst.h - rd.bottom;
1128 rd.top = dst.h - rd.top;
1130 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1131 return E_INVALIDARG;
1133 int w = rs.Width(), h = rs.Height();
1134 bool bottom_down = rd.top > rd.bottom;
1136 BYTE* d = NULL;
1137 BYTE* dUV = NULL;
1138 if (!bottom_down)
1140 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1141 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left;
1143 else
1145 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1146 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left;
1147 dst.pitch = -dst.pitch;
1149 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
1151 const BYTE* sa = reinterpret_cast<const BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1152 const BYTE* sy = sa + src.pitch*src.h;
1153 const BYTE* s_uv = sy + src.pitch*src.h;//UV
1155 return AlphaBltAnv12_Nv12(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
1158 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1160 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1161 if(dirtyRectList!=NULL)
1163 POSITION pos = dirtyRectList->GetHeadPosition();
1164 if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1165 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1166 || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1168 while(pos!=NULL)
1170 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1171 cRectSrc.left &= ~15;
1172 cRectSrc.right = (cRectSrc.right+15)&~15;
1173 if(cRectSrc.right>m_spd.w)
1175 cRectSrc.right = m_spd.w;
1177 cRectSrc.top &= ~1;
1178 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1179 ASSERT(cRectSrc.bottom<=m_spd.h);
1182 else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1184 while(pos!=NULL)
1186 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1187 cRectSrc.left &= ~3;
1188 cRectSrc.right = (cRectSrc.right+3)&~3;
1189 cRectSrc.right = cRectSrc.right < m_spd.w ? cRectSrc.right : m_spd.w;
1190 ASSERT((cRectSrc.right & 3)==0);
1194 return __super::SetDirtyRectEx(dirtyRectList);
1198 // static
// Blends one 8-bit plane: for every sample whose alpha byte is below 0xff,
// dst = ((dst*alpha)>>8) + sub. dst/dst_pitch describe the destination rows,
// sub and alpha the source value and multiplier rows (both use sub_pitch),
// w/h the size in samples.
// The SSE2 path is taken only when alpha, sub and dst share the same alignment
// modulo 16, both pitches are multiples of 16, w > 32 and the CPU reports
// SSE2; otherwise the plain C routine is used.
1201 void CMemSubPic::AlphaBltYv12Luma(byte* dst, int dst_pitch,
1202 int w, int h,
1203 const byte* sub, const byte* alpha, int sub_pitch)
1205 if( (
1206 ((reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(sub))
1207 |(reinterpret_cast<intptr_t>(alpha) ^ reinterpret_cast<intptr_t>(dst))
1208 | static_cast<intptr_t>(sub_pitch)
1209 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0
1210 && w > 32 && (g_cpuid.m_flags & CCpuID::sse2))
// head: samples before the first 16-byte boundary; tail: leftovers after the last full
// 16-sample group; w1: end offset of the vectorised part as used for s2end_mod16 below.
1212 int head = (16 - (reinterpret_cast<intptr_t>(alpha)&15))&15;
1213 int tail = (w-head) & 15;
1214 int w1 = w - head - tail;
1215 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1217 const BYTE* sa = alpha;
1218 const BYTE* s2 = sub;
1219 const BYTE* s2end_mod16 = s2 + w1;
1220 const BYTE* s2end = s2 + w;
1221 BYTE* d2 = dst;
// Scalar prologue until the source pointer is 16-byte aligned.
1223 for( ; (reinterpret_cast<intptr_t>(s2)&15) != 0; s2++, sa++, d2++)
1225 if(sa[0] < 0xff)
1227 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
// Vectorised body, 16 samples per call.
1230 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
1232 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
// Scalar epilogue for whatever is left in the row.
1234 for(; s2 < s2end; s2++, sa++, d2++)
1236 if(sa[0] < 0xff)
1238 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1243 else //fix me: only a workaround for non-mod-16 size video
1245 CMemSubPic::AlphaBltYv12LumaC(dst, dst_pitch, w, h, sub, alpha, sub_pitch);
1249 void CMemSubPic::AlphaBltYv12LumaC( byte* dst, int dst_pitch, int w, int h, const byte* sub, const byte* alpha, int sub_pitch )
1251 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
1253 const BYTE* sa = alpha;
1254 const BYTE* s2 = sub;
1255 const BYTE* s2end = s2 + w;
1256 BYTE* d2 = dst;
1257 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1259 if(sa[0] < 0xff)
1261 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
1262 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
1268 void CMemSubPic::AlphaBltYv12Chroma(byte* dst_uv, int dst_pitch,
1269 int w, int chroma_h,
1270 const byte* src_uv, const byte* src_a, int src_pitch)
1272 if( (
1273 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1274 |(reinterpret_cast<intptr_t>(src_a) ^ (2*reinterpret_cast<intptr_t>(dst_uv)))
1275 | static_cast<intptr_t>(src_pitch)
1276 | (2*static_cast<intptr_t>(dst_pitch)) ) & 15) ==0 &&
1277 w > 16 && (g_cpuid.m_flags & CCpuID::sse2))
1279 int head = (16 - (reinterpret_cast<intptr_t>(src_a)&15))&15;
1280 int tail = (w-head) & 15;
1281 int w00 = w - head - tail;
1283 int pitch = src_pitch;
1284 for(int j = 0; j < chroma_h; j++, src_uv += src_pitch*2, src_a += src_pitch*2, dst_uv += dst_pitch)
1286 hleft_vmid_mix_uv_yv12_c2(dst_uv, head, src_uv, src_a, src_pitch);
1287 hleft_vmid_mix_uv_yv12_sse2(dst_uv+(head>>1), w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1288 hleft_vmid_mix_uv_yv12_c2(dst_uv+((head+w00)>>1), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1291 else//fix me: only a workaround for non-mod-16 size video
1293 AlphaBltYv12ChromaC(dst_uv, dst_pitch, w, chroma_h, src_uv, src_a, src_pitch);
1297 void CMemSubPic::AlphaBltYv12ChromaC( byte* dst, int dst_pitch, int w, int chroma_h, const byte* sub_chroma, const byte* alpha, int sub_pitch )
1299 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
1301 hleft_vmid_mix_uv_yv12_c(dst, w, sub_chroma, alpha, sub_pitch);
1305 HRESULT CMemSubPic::AlphaBltAnv12_P010( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1306 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1308 if ( g_cpuid.m_flags & CCpuID::sse2 )
1310 const BYTE* sa = src_a;
1311 if( (
1312 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_y))
1313 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_y))
1314 | static_cast<intptr_t>(src_pitch)
1315 | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 &&
1316 w > 32 )
1318 int head = (16 - reinterpret_cast<intptr_t>(src_a)&15)&15;
1319 int tail = (w - head) & 15;
1321 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1323 const BYTE* sa2 = sa;
1324 const BYTE* s2 = src_y;
1325 const BYTE* s2end_mod16 = s2 + (w&~15);
1326 BYTE* d2 = dst_y;
1327 WORD* d_w=reinterpret_cast<WORD*>(dst_y);
1329 switch( head )//important: it is safe since w > 16
1331 case 15:
1332 #define _XY_MIX_ONE if(sa2[0] < 0xff) { d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8); } sa2++;d_w++;s2++;
1333 _XY_MIX_ONE
1334 case 14:
1335 _XY_MIX_ONE
1336 case 13:
1337 _XY_MIX_ONE
1338 case 12:
1339 _XY_MIX_ONE
1340 case 11:
1341 _XY_MIX_ONE
1342 case 10:
1343 _XY_MIX_ONE
1344 case 9:
1345 _XY_MIX_ONE
1346 case 8:
1347 _XY_MIX_ONE
1348 case 7:
1349 _XY_MIX_ONE
1350 case 6:
1351 _XY_MIX_ONE
1352 case 5:
1353 _XY_MIX_ONE
1354 case 4:
1355 _XY_MIX_ONE
1356 case 3:
1357 _XY_MIX_ONE
1358 case 2:
1359 _XY_MIX_ONE
1360 case 1://fall through on purpose
1361 _XY_MIX_ONE
1363 for(; s2 < s2end_mod16; s2+=16, sa2+=16, d_w+=16)
1365 mix_16_y_p010_sse2( reinterpret_cast<BYTE*>(d_w), s2, sa2);
1367 switch( tail )//important: it is safe since w > 16
1369 case 15:
1370 _XY_MIX_ONE
1371 case 14:
1372 _XY_MIX_ONE
1373 case 13:
1374 _XY_MIX_ONE
1375 case 12:
1376 _XY_MIX_ONE
1377 case 11:
1378 _XY_MIX_ONE
1379 case 10:
1380 _XY_MIX_ONE
1381 case 9:
1382 _XY_MIX_ONE
1383 case 8:
1384 _XY_MIX_ONE
1385 case 7:
1386 _XY_MIX_ONE
1387 case 6:
1388 _XY_MIX_ONE
1389 case 5:
1390 _XY_MIX_ONE
1391 case 4:
1392 _XY_MIX_ONE
1393 case 3:
1394 _XY_MIX_ONE
1395 case 2:
1396 _XY_MIX_ONE
1397 case 1://fall through on purpose
1398 _XY_MIX_ONE
1402 else //fix me: only a workaround for non-mod-16 size video
1404 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1406 const BYTE* sa2 = sa;
1407 const BYTE* s2 = src_y;
1408 const BYTE* s2end = s2 + w;
1409 WORD* d_w = reinterpret_cast<WORD*>(dst_y);
1410 for(; s2 < s2end; s2+=1, sa2+=1, d_w+=1)
1412 if(sa2[0] < 0xff)
1414 d_w[0] = ((d_w[0]*sa2[0])>>8) + (s2[0]<<8);
1419 //UV
1420 int h2 = h/2;
1421 BYTE* d = dst_uv;
1422 if( (
1423 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1424 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1425 | static_cast<intptr_t>(src_pitch)
1426 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1427 w > 16 )
1429 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1430 int tail = (w-head) & 15;
1431 int w00 = w - head - tail;
1433 ASSERT(w>0);//the calls to mix may failed if w==0
1434 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1436 hleft_vmid_mix_uv_p010_c2(d, head, src_uv, src_a, src_pitch);
1437 hleft_vmid_mix_uv_p010_sse2(d+2*head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1438 hleft_vmid_mix_uv_p010_c2(d+2*(head+w00), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1441 else
1443 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1445 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1448 #ifndef _WIN64
1449 // TODOX64 : fixme!
1450 _mm_empty();
1451 #endif
1452 return S_OK;
1454 else
1456 return AlphaBltAnv12_P010_C(src_a, src_y, src_uv, src_pitch, dst_y, dst_uv, dst_pitch, w, h);
1460 HRESULT CMemSubPic::AlphaBltAnv12_P010_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1462 const BYTE* sa = src_a;
1463 for(int i=0; i<h; i++, sa += src_pitch, src_y += src_pitch, dst_y += dst_pitch)
1465 const BYTE* sa2 = sa;
1466 const BYTE* s2 = src_y;
1467 const BYTE* s2end = s2 + w;
1468 WORD* d2 = reinterpret_cast<WORD*>(dst_y);
1469 for(; s2 < s2end; s2+=1, sa2+=1, d2+=1)
1471 if(sa2[0] < 0xff)
1473 d2[0] = ((d2[0]*sa2[0])>>8) + (s2[0]<<8);
1477 //UV
1478 int h2 = h/2;
1479 BYTE* d = dst_uv;
1480 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1482 hleft_vmid_mix_uv_p010_c(d, w, src_uv, src_a, src_pitch);
1484 return S_OK;
1487 HRESULT CMemSubPic::AlphaBltAnv12_Nv12( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch,
1488 BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1490 AlphaBltYv12Luma( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1492 int h2 = h/2;
1493 if( (
1494 ((reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(src_uv))
1495 |(reinterpret_cast<intptr_t>(src_a) ^ reinterpret_cast<intptr_t>(dst_uv))
1496 | static_cast<intptr_t>(src_pitch)
1497 | static_cast<intptr_t>(dst_pitch) ) & 15) ==0 &&
1498 w > 16 && (g_cpuid.m_flags & CCpuID::sse2) )
1500 BYTE* d = dst_uv;
1502 int head = (16-(reinterpret_cast<intptr_t>(src_a)&15))&15;
1503 int tail = (w-head) & 15;
1504 int w00 = w - head - tail;
1506 ASSERT(w>0);//the calls to mix may failed if w==0
1507 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1509 hleft_vmid_mix_uv_nv12_c2(d, head, src_uv, src_a, src_pitch);
1510 hleft_vmid_mix_uv_nv12_sse2(d+head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0);
1511 hleft_vmid_mix_uv_nv12_c2(d+head+w00, tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0);
1513 #ifndef _WIN64
1514 // TODOX64 : fixme!
1515 _mm_empty();
1516 #endif
1518 else
1520 BYTE* d = dst_uv;
1521 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1523 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1526 return S_OK;
1529 HRESULT CMemSubPic::AlphaBltAnv12_Nv12_C( const BYTE* src_a, const BYTE* src_y, const BYTE* src_uv, int src_pitch, BYTE* dst_y, BYTE* dst_uv, int dst_pitch, int w, int h )
1531 AlphaBltYv12LumaC( dst_y, dst_pitch, w, h, src_y, src_a, src_pitch );
1532 int h2 = h/2;
1533 BYTE* d = dst_uv;
1534 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch)
1536 hleft_vmid_mix_uv_nv12_c(d, w, src_uv, src_a, src_pitch);
1538 return S_OK;
1541 void CMemSubPic::SubsampleAndInterlace( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1543 for (int i=0;i<h;i+=2)
1545 hleft_vmid_subsample_and_interlace_2_line_sse2(dst, u, v, w, pitch);
1546 u += 2*pitch;
1547 v += 2*pitch;
1548 dst += pitch;
1552 void CMemSubPic::SubsampleAndInterlaceC( BYTE* dst, const BYTE* u, const BYTE* v, int h, int w, int pitch )
1554 for (int i=0;i<h;i+=2)
1556 hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w, pitch);
1557 u += 2*pitch;
1558 v += 2*pitch;
1559 dst += pitch;
1563 void CMemSubPic::AlphaBlt_YUY2(int w, int h, BYTE* d, int dstpitch, PCUINT8 s, int srcpitch)
1565 #ifdef _WIN64
1566 AlphaBlt_YUY2_C(w, h, d, dstpitch, s, srcpitch);
1567 #else
1568 AlphaBlt_YUY2_MMX(w, h, d, dstpitch, s, srcpitch);
1569 #endif
1573 // CMemSubPicAllocator
1576 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1577 : CSubPicExAllocatorImpl(maxsize, false, false)
1578 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1579 , m_maxsize(maxsize)
1580 , m_type(type)
1582 if(m_type==-1)
1584 switch(alpha_blt_dst_type)
1586 case MSP_YUY2:
1587 m_type = MSP_XY_AUYV;
1588 break;
1589 case MSP_AYUV:
1590 m_type = MSP_AYUV;
1591 break;
1592 case MSP_IYUV:
1593 case MSP_YV12:
1594 case MSP_P010:
1595 case MSP_P016:
1596 case MSP_NV12:
1597 case MSP_NV21:
1598 m_type = MSP_AYUV_PLANAR;
1599 break;
1600 default:
1601 m_type = MSP_RGBA;
1602 break;
1607 // ISubPicAllocatorImpl
1609 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1611 if(!ppSubPic) {
1612 return false;
1614 SubPicDesc spd;
1615 spd.w = m_maxsize.cx;
1616 spd.h = m_maxsize.cy;
1617 spd.bpp = 32;
1618 spd.pitch = (spd.w*spd.bpp)>>3;
1619 spd.type = m_type;
1620 spd.bits = DEBUG_NEW BYTE[spd.pitch*spd.h];
1621 if(!spd.bits) {
1622 return false;
1624 *ppSubPic = DEBUG_NEW CMemSubPic(spd, m_alpha_blt_dst_type);
1625 if(!(*ppSubPic)) {
1626 return false;
1628 (*ppSubPic)->AddRef();
1629 return true;