SSE2 code for P010/P016 (with a possible issue there, P010 alphablending may modify...
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blob7c3e5ef2e2014d909c3787fb4a3e6e7b0d9cb8c8
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
// MIX_4_PIX_YV12(dst, zero_128i, c_128i, a_128i)
// Statement macro: loads 32 bits from *dst into the low dword of an XMM register,
// runs _MIX_4_PIX_YV12 on that register, then stores the low 32 bits of the result
// back through *dst (via a DWORD cast).
//   dst       - pointer that is dereferenced for a 32-bit load and store
//   zero_128i - passed through to _MIX_4_PIX_YV12 (used there as the zero-extension operand)
//   c_128i    - passed through to _MIX_4_PIX_YV12 (16-bit lanes interleaved with the pixels)
//   a_128i    - passed through to _MIX_4_PIX_YV12 (16-bit multipliers for _mm_madd_epi16)
// NOTE(review): the closing line of this macro is not visible in this extract.
26 #define MIX_4_PIX_YV12(dst, zero_128i, c_128i, a_128i) \
27 { \
28 __m128i d_128i = _mm_cvtsi32_si128(*dst); \
29 _MIX_4_PIX_YV12(d_128i, zero_128i, c_128i, a_128i) \
30 *dst = (DWORD)_mm_cvtsi128_si32(d_128i); \
// _MIX_4_PIX_YV12(dst_128i, zero_128i, c_128i, a_128i)
// Statement macro operating in place on dst_128i:
//   1. the low 8 bytes of dst_128i are interleaved with zero_128i (byte -> 16-bit lanes
//      when zero_128i is all zeroes);
//   2. the low four 16-bit lanes are interleaved with the low four lanes of c_128i,
//      producing (dst, c) word pairs;
//   3. _mm_madd_epi16 multiplies each pair by the matching word pair in a_128i and sums
//      the two products into one 32-bit lane;
//   4. each 32-bit sum is shifted right by 8;
//   5. the results are packed to 16 bits (signed saturation) and then to 8 bits
//      (unsigned saturation).
// After the macro the low 4 bytes of dst_128i hold the four mixed pixels.
// NOTE(review): the closing line of this macro is not visible in this extract.
33 #define _MIX_4_PIX_YV12(dst_128i, zero_128i, c_128i, a_128i) \
34 { \
35 dst_128i = _mm_unpacklo_epi8(dst_128i, zero_128i); \
36 dst_128i = _mm_unpacklo_epi16(dst_128i, c_128i); \
37 dst_128i = _mm_madd_epi16(dst_128i, a_128i); \
38 dst_128i = _mm_srli_epi32(dst_128i, 8); \
39 dst_128i = _mm_packs_epi32(dst_128i, dst_128i); \
40 dst_128i = _mm_packus_epi16(dst_128i, dst_128i); \
// AVERAGE_4_PIX(a, b) - inline-asm fragment working on two XMM registers.
//   pavgb a, b   : a = per-byte rounded average of a and b
//   movaps b, a  : b = copy of that average
//   psrlw a, 8   : a = the odd-numbered bytes, one per 16-bit lane
//   psllw/psrlw b: b = the even-numbered bytes, one per 16-bit lane
//   pavgw a, b   : a = rounded average of each even/odd byte pair
// Result: a holds eight 16-bit values, each the two-stage rounded average of two
// horizontally adjacent bytes of the two input registers. b is clobbered.
43 #define AVERAGE_4_PIX(a,b) \
44 __asm pavgb a, b \
45 __asm movaps b, a \
46 __asm psrlw a, 8 \
47 __asm psllw b, 8 \
48 __asm psrlw b, 8 \
49 __asm pavgw a, b
// SSE2_ALPHA_BLT_UV(dst, alpha_mask, src, src_pitch) - x86 inline-asm fragment.
// Processes 16 source bytes from each of two consecutive rows and updates 8 bytes at dst:
//   1. loads 16 bytes at alpha_mask and at alpha_mask + src_pitch and reduces them with
//      AVERAGE_4_PIX to eight 16-bit averaged values;
//   2. loads 8 bytes from dst, widens them to 16 bits, multiplies by the averaged
//      values and shifts right by 8;
//   3. loads 16 bytes at src and at src + src_pitch, reduces them the same way and adds
//      the result to the scaled destination;
//   4. packs to 8 bits with unsigned saturation and stores 8 bytes back to dst.
// The row loads use movaps, so alpha_mask, src and both + src_pitch addresses must be
// 16-byte aligned. Clobbers eax, esi, edi, XMM0-XMM3 and MM0; because an MMX register
// is written, the code using this macro has to execute emms afterwards.
51 #define SSE2_ALPHA_BLT_UV(dst, alpha_mask, src, src_pitch) \
52 __asm mov eax,src_pitch \
54 __asm xorps XMM0,XMM0 \
55 __asm mov esi, alpha_mask \
56 __asm movaps XMM1,[esi] \
57 __asm add esi, eax \
58 __asm movaps XMM2,[esi] \
60 __asm AVERAGE_4_PIX(XMM1, XMM2) \
61 __asm mov edi, dst \
62 __asm movlps XMM3,[edi] \
63 __asm punpcklbw XMM3,XMM0 \
64 __asm pmullw XMM3,XMM1 \
65 __asm psrlw XMM3,8 \
67 __asm mov esi, src \
68 __asm movaps XMM1,[esi] \
69 __asm add esi, eax \
70 __asm movaps XMM2,[esi] \
71 __asm AVERAGE_4_PIX(XMM1, XMM2) \
73 __asm paddw XMM3,XMM1 \
74 __asm packuswb XMM3,XMM0 \
76 __asm movdq2q MM0, XMM3 \
77 __asm movq [edi],MM0
80 // CMemSubPic
83 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
84 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
86 m_maxsize.SetSize(spd.w, spd.h);
87 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
88 CRect allSpd(0,0,spd.w, spd.h);
89 m_rectListDirty.AddTail(allSpd);
92 CMemSubPic::~CMemSubPic()
94 delete [] m_spd.bits, m_spd.bits = NULL;
97 // ISubPic
99 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
101 return (void*)&m_spd;
104 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
106 spd.type = m_spd.type;
107 spd.w = m_size.cx;
108 spd.h = m_size.cy;
109 spd.bpp = m_spd.bpp;
110 spd.pitch = m_spd.pitch;
111 spd.bits = m_spd.bits;
112 spd.bitsU = m_spd.bitsU;
113 spd.bitsV = m_spd.bitsV;
114 spd.vidrect = m_vidrect;
115 return S_OK;
118 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
120 HRESULT hr;
121 if(FAILED(hr = __super::CopyTo(pSubPic))) {
122 return hr;
125 SubPicDesc src, dst;
126 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
127 return E_FAIL;
129 while(!m_rectListDirty.IsEmpty())
131 CRect& cRect = m_rectListDirty.GetHead();
132 int w = cRect.Width(), h = cRect.Height();
133 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
134 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
135 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
136 memcpy(d, s, w*4);
138 return S_OK;
141 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
143 if(m_rectListDirty.IsEmpty()) {
144 return S_OK;
146 while(!m_rectListDirty.IsEmpty())
148 //pDirtyRect = m_rectListDirty.RemoveHead();
149 CRect& dirtyRect = m_rectListDirty.RemoveTail();
150 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
151 int w = dirtyRect.Width();
152 if(m_spd.type!=MSP_AY11)
154 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
156 #ifdef _WIN64
157 memsetd(p, color, w*4); // nya
158 #else
159 __asm
161 mov eax, color
162 mov ecx, w
163 mov edi, p
165 rep stosd
168 #endif
171 else
173 ///TODO:
174 ///FIX ME
175 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
177 // memsetd(p, 0, m_rcDirty.Width());
178 //DbgLog((LOG_TRACE, 3, "w:%d", w));
179 //w = pDirtyRect->Width();
180 memset(p, 0xFF, w);
181 memset(p+m_spd.h*m_spd.pitch, 0, w);
182 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
183 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
187 m_rectListDirty.RemoveAll();
188 return S_OK;
191 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
193 return GetDesc(spd);
196 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
198 int src_type = m_spd.type;
199 int dst_type = m_alpha_blt_dst_type;
200 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
201 dst_type == MSP_RGB24 ||
202 dst_type == MSP_RGB16 ||
203 dst_type == MSP_RGB15))
205 (src_type==MSP_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
207 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
209 (src_type==MSP_AY11 && (dst_type == MSP_IYUV ||
210 dst_type == MSP_YV12 ||
211 dst_type == MSP_P010 ||
212 dst_type == MSP_P016)))
214 return UnlockOther(dirtyRectList);
216 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
217 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
218 dst_type == MSP_IYUV ||
219 dst_type == MSP_YV12))
221 return UnlockRGBA_YUV(dirtyRectList);
223 return E_NOTIMPL;
226 STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
228 SetDirtyRectEx(dirtyRectList);
229 if(m_rectListDirty.IsEmpty()) {
230 return S_OK;
233 POSITION pos = m_rectListDirty.GetHeadPosition();
234 while(pos!=NULL)
236 const CRect& cRect = m_rectListDirty.GetNext(pos);
237 int w = cRect.Width(), h = cRect.Height();
238 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
239 BYTE* bottom = top + m_spd.pitch*h;
240 if(m_alpha_blt_dst_type == MSP_RGB16)
242 for(; top < bottom ; top += m_spd.pitch)
244 DWORD* s = (DWORD*)top;
245 DWORD* e = s + w;
246 for(; s < e; s++)
248 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
249 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
253 else if(m_alpha_blt_dst_type == MSP_RGB15)
255 for(; top < bottom; top += m_spd.pitch)
257 DWORD* s = (DWORD*)top;
258 DWORD* e = s + w;
259 for(; s < e; s++)
261 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
262 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
266 else if(m_alpha_blt_dst_type == MSP_YUY2)
268 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
270 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
272 BYTE* s = tempTop;
273 BYTE* e = s + w*4;
274 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
276 s[4] = (s[0] + s[4])>>1;
277 s[0] = (s[2] + s[6])>>1;
281 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
283 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV || m_alpha_blt_dst_type == MSP_YV12
284 || m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016)
286 //nothing to do
289 return S_OK;
292 STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
294 SetDirtyRectEx(dirtyRectList);
295 if(m_rectListDirty.IsEmpty()) {
296 return S_OK;
299 const ColorConvTable *conv_table = ColorConvTable::GetDefaultColorConvTable();
300 const int *c2y_yb = conv_table->c2y_yb;
301 const int *c2y_yg = conv_table->c2y_yg;
302 const int *c2y_yr = conv_table->c2y_yr;
303 const int cy_cy2 = conv_table->cy_cy2;
304 const int c2y_cu = conv_table->c2y_cu;
305 const int c2y_cv = conv_table->c2y_cv;
306 const int cy_cy = conv_table->cy_cy;
307 const unsigned char* Clip = conv_table->Clip;
309 POSITION pos = m_rectListDirty.GetHeadPosition();
310 while(pos!=NULL)
312 const CRect& cRect = m_rectListDirty.GetNext(pos);
313 int w = cRect.Width(), h = cRect.Height();
315 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
316 BYTE* bottom = top + m_spd.pitch*h;
318 if(m_alpha_blt_dst_type == MSP_YUY2 || m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV) {
319 for(; top < bottom ; top += m_spd.pitch) {
320 BYTE* s = top;
321 BYTE* e = s + w*4;
322 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
323 if((s[3]+s[7]) < 0x1fe) {
324 int a = 0x200 - (s[3]+s[7]);
325 a <<= 7;
326 // 0 <= a <= 0x10000
327 s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
328 s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a + 0x8000) >> 16;
330 int scaled_y = (s[1]+s[5]-32) * cy_cy2;
332 s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
333 s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
334 } else {
335 s[1] = s[5] = 0;
336 s[0] = s[4] = 0;
341 else if(m_alpha_blt_dst_type == MSP_AYUV) {
342 for(; top < bottom ; top += m_spd.pitch) {
343 BYTE* s = top;
344 BYTE* e = s + w*4;
345 for(; s < e; s+=4) { // ARGB -> AYUV
346 if(s[3] < 0xff) {
347 int a = 0x100 - s[3];
348 a <<= 8;
349 // 0 <= a <= 0x10000
351 int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
352 int scaled_y = (y-32) * cy_cy;
353 s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
354 s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
355 s[2] = y;
356 } else {
357 s[0] = s[1] = 0;
358 s[2] = 0;
364 return S_OK;
367 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
369 if(!pSrc || !pDst || !pTarget) {
370 return E_POINTER;
372 int src_type = m_spd.type;
373 int dst_type = pTarget->type;
375 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
376 dst_type == MSP_RGB24 ||
377 dst_type == MSP_RGB16 ||
378 dst_type == MSP_RGB15 ||
379 dst_type == MSP_RGBA ||
380 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
381 dst_type == MSP_AYUV ))
383 (src_type==MSP_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
385 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
387 (src_type==MSP_AY11 && (dst_type == MSP_IYUV ||
388 dst_type == MSP_YV12 ||
389 dst_type == MSP_P010 ||
390 dst_type == MSP_P016 )) )
392 return AlphaBltOther(pSrc, pDst, pTarget);
394 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
395 dst_type == MSP_YV12))
397 return AlphaBltAyuv_Yv12(pSrc, pDst, pTarget);
399 return E_NOTIMPL;
402 STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
404 const SubPicDesc& src = m_spd;
405 SubPicDesc dst = *pTarget; // copy, because we might modify it
407 CRect rs(*pSrc), rd(*pDst);
408 if(dst.h < 0)
410 dst.h = -dst.h;
411 rd.bottom = dst.h - rd.bottom;
412 rd.top = dst.h - rd.top;
414 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
415 return E_INVALIDARG;
417 int w = rs.Width(), h = rs.Height();
418 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
419 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
420 if(rd.top > rd.bottom)
422 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
423 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
424 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
426 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
428 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
430 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
432 else
434 return E_NOTIMPL;
436 dst.pitch = -dst.pitch;
438 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
439 switch(dst.type)
441 case MSP_RGBA:
442 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
444 BYTE* s2 = s;
445 BYTE* s2end = s2 + w*4;
446 DWORD* d2 = (DWORD*)d;
447 for(; s2 < s2end; s2 += 4, d2++)
449 if(s2[3] < 0xff)
451 DWORD bd =0x00000100 -( (DWORD) s2[3]);
452 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
453 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
454 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
455 *d2 = B | V | R
456 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
460 break;
461 case MSP_RGB32:
462 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
463 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
465 BYTE* s2 = s;
466 BYTE* s2end = s2 + w*4;
467 DWORD* d2 = (DWORD*)d;
468 for(; s2 < s2end; s2 += 4, d2++)
470 #ifdef _WIN64
471 DWORD ia = 256-s2[3];
472 if(s2[3] < 0xff) {
473 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
474 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
476 #else
477 if(s2[3] < 0xff)
479 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
480 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
482 #endif
485 break;
486 case MSP_RGB24:
487 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
489 BYTE* s2 = s;
490 BYTE* s2end = s2 + w*4;
491 BYTE* d2 = d;
492 for(; s2 < s2end; s2 += 4, d2 += 3)
494 if(s2[3] < 0xff)
496 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
497 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
498 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
502 break;
503 case MSP_RGB16:
504 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
506 BYTE* s2 = s;
507 BYTE* s2end = s2 + w*4;
508 WORD* d2 = (WORD*)d;
509 for(; s2 < s2end; s2 += 4, d2++)
511 if(s2[3] < 0x1f)
513 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
514 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
515 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
516 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
517 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
522 break;
523 case MSP_RGB15:
524 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
526 BYTE* s2 = s;
527 BYTE* s2end = s2 + w*4;
528 WORD* d2 = (WORD*)d;
529 for(; s2 < s2end; s2 += 4, d2++)
531 if(s2[3] < 0x1f)
533 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
534 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
535 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
536 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
537 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
542 break;
543 case MSP_YUY2:
544 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
546 unsigned int ia, c;
547 BYTE* s2 = s;
548 BYTE* s2end = s2 + w*4;
549 DWORD* d2 = (DWORD*)d;
550 for(; s2 < s2end; s2 += 8, d2++)
552 ia = (s2[3]+s2[7])>>1;
553 if(ia < 0xff)
555 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
556 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
557 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
558 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
559 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
561 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
562 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
563 __asm
565 mov edi, d2
566 pxor mm0, mm0
567 movd mm2, c
568 punpcklbw mm2, mm0
569 movd mm3, [edi]
570 punpcklbw mm3, mm0
571 movd mm4, ia
572 punpcklbw mm4, mm0
573 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
574 pmullw mm3, mm4
575 psraw mm3, 7
576 paddsw mm3, mm2
577 packuswb mm3, mm3
578 movd [edi], mm3
583 __asm emms;
584 break;
585 case MSP_YV12:
586 case MSP_IYUV:
588 //dst.pitch = abs(dst.pitch);
589 int h2 = h/2;
590 if(!dst.pitchUV)
592 dst.pitchUV = abs(dst.pitch)/2;
594 if(!dst.bitsU || !dst.bitsV)
596 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
597 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
598 if(dst.type == MSP_YV12)
600 BYTE* p = dst.bitsU;
601 dst.bitsU = dst.bitsV;
602 dst.bitsV = p;
605 BYTE* dd[2];
606 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
607 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
608 if(rd.top > rd.bottom)
610 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
611 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
612 dst.pitchUV = -dst.pitchUV;
615 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
616 BYTE *s = src_origin;
618 BYTE* ss[2];
619 ss[0] = src_origin + src.pitch*src.h*2;//U
620 ss[1] = src_origin + src.pitch*src.h*3;//V
622 //equivalent:
623 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
624 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
625 if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
626 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
628 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
630 BYTE* sa = s;
631 BYTE* s2 = s + src.pitch*src.h;
632 BYTE* s2end_mod16 = s2 + (w&~15);
633 BYTE* s2end = s2 + w;
634 BYTE* d2 = d;
636 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
638 __asm
640 //important!
641 mov edi, d2
642 mov esi, sa
644 movaps XMM3,[edi]
645 xorps XMM0,XMM0
646 movaps XMM4,XMM3
647 punpcklbw XMM4,XMM0
649 movaps XMM1,[esi]
650 movaps XMM5,XMM1
651 punpcklbw XMM5,XMM0
652 pmullw XMM4,XMM5
653 psrlw XMM4,8
655 punpckhbw XMM1,XMM0
656 punpckhbw XMM3,XMM0
657 pmullw XMM1,XMM3
658 psrlw XMM1,8
660 packuswb XMM4,XMM1
661 mov esi, s2
662 movaps XMM3,[esi]
663 paddusb XMM4,XMM3
664 movntps [edi],XMM4
667 for(; s2 < s2end; s2++, sa++, d2++)
669 d2[0] = (((d2[0])*sa[0])>>8) + s2[0];
673 else //fix me: only a workaround for non-mod-16 size video
675 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
677 BYTE* sa = s;
678 BYTE* s2 = s + src.pitch*src.h;
679 BYTE* s2end_mod16 = s2 + (w&~15);
680 BYTE* s2end = s2 + w;
681 BYTE* d2 = d;
682 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
684 //if(s2[3] < 0xff)
686 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
687 d2[0] = (((d2[0])*sa[0])>>8) + s2[0];
692 //equivalent:
693 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa2)&15)==0
694 // && (reinterpret_cast<intptr_t>(d2)&7)==0 )
695 if( ((reinterpret_cast<intptr_t>(ss[0]) | reinterpret_cast<intptr_t>(ss[1]) |
696 reinterpret_cast<intptr_t>(dd[0]) | reinterpret_cast<intptr_t>(dd[1]) |
697 reinterpret_cast<intptr_t>(src_origin) | static_cast<intptr_t>(src.pitch) |
698 (static_cast<intptr_t>(dst.pitchUV)&7) ) & 15 )==0 )
700 for(int i = 0; i < 2; i++)
702 BYTE* s_uv = ss[i];
703 BYTE* sa = src_origin;
704 d = dd[i];
705 int pitch = src.pitch;
706 for(int j = 0; j < h2; j++, s_uv += src.pitch*2, sa += src.pitch*2, d += dst.pitchUV)
708 BYTE* s2 = s_uv;
709 BYTE* sa2 = sa;
710 BYTE* s2end_mod16 = s2 + (w&~15);
711 BYTE* s2end = s2 + w;
712 BYTE* d2 = d;
714 for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
716 SSE2_ALPHA_BLT_UV(d2, sa2, s2, pitch)
718 for(; s2 < s2end; s2+=2, sa2+=2, d2++)
720 unsigned int ia = (sa2[0]+ +sa2[1]+
721 sa2[0+src.pitch]+sa2[1+src.pitch])>>2;
722 *d2 = (((*d2)*ia)>>8) + ((s2[0] +s2[1]+
723 s2[src.pitch]+s2[1+src.pitch] )>>2);
728 else//fix me: only a workaround for non-mod-16 size video
730 for(int i = 0; i < 2; i++)
732 BYTE* s_uv = ss[i];
733 BYTE* sa = src_origin;
734 d = dd[i];
735 int pitch = src.pitch;
736 for(int j = 0; j < h2; j++, s_uv += src.pitch*2, sa += src.pitch*2, d += dst.pitchUV)
738 BYTE* s2 = s_uv;
739 BYTE* sa2 = sa;
740 BYTE* s2end_mod16 = s2 + (w&~15);
741 BYTE* s2end = s2 + w;
742 BYTE* d2 = d;
743 for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
745 unsigned int ia = (sa2[0]+ +sa2[1]+
746 sa2[0+src.pitch]+sa2[1+src.pitch])>>2;
747 //if(ia < 0xff)
749 // *d2 = (((*d2-0x80)*ia)>>8) + ((s2[0] +s2[1]
750 // s2[src.pitch]+s2[1+src.pitch] )>>2);
751 *d2 = (((*d2)*ia)>>8) + ((s2[0] +s2[1]+
752 s2[src.pitch]+s2[1+src.pitch] )>>2);
758 __asm emms;
760 break;
761 case MSP_P010:
762 case MSP_P016:
764 //dst.pitch = abs(dst.pitch);
765 int h2 = h/2;
766 if(!dst.pitchUV)
768 dst.pitchUV = abs(dst.pitch);
770 if(!dst.bitsU || !dst.bitsV)
772 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
773 dst.bitsV = dst.bitsU + 2;
775 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
776 if(rd.top > rd.bottom)
778 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
779 dst.pitchUV = -dst.pitchUV;
782 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
783 BYTE *s = src_origin;
785 BYTE* ss[2];
786 ss[0] = src_origin + src.pitch*src.h*2;//U
787 ss[1] = src_origin + src.pitch*src.h*3;//V
789 // equivalent:
790 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
791 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
792 if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
793 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
795 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
797 BYTE* sa = s;
798 BYTE* s2 = s + src.pitch*src.h;
799 BYTE* s2end_mod16 = s2 + (w&~15);
800 BYTE* s2end = s2 + w;
801 BYTE* d2 = d;
803 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
805 //important!
806 __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(sa) );
807 __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(s2) );
808 __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(d2) );
809 __m128i lo = _mm_setzero_si128();
810 lo = _mm_unpacklo_epi8(lo, alpha);
811 dst_y = _mm_mulhi_epu16(dst_y, lo);
812 lo = _mm_setzero_si128();
813 lo = _mm_unpacklo_epi8(lo, src_y);
814 dst_y = _mm_adds_epu16(dst_y, lo);
815 _mm_store_si128( reinterpret_cast<__m128i*>(d2), dst_y );
817 d2 += 16;
818 dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(d2) );
819 lo = _mm_setzero_si128();
820 lo = _mm_unpackhi_epi8(lo, alpha);
821 dst_y = _mm_mulhi_epu16(dst_y, lo);
822 lo = _mm_setzero_si128();
823 lo = _mm_unpackhi_epi8(lo, src_y);
824 dst_y = _mm_adds_epu16(dst_y, lo);
825 _mm_store_si128( reinterpret_cast<__m128i*>(d2), dst_y );
827 for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
829 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
833 else //fix me: only a workaround for non-mod-16 size video
835 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
837 BYTE* sa = s;
838 BYTE* s2 = s + src.pitch*src.h;
839 BYTE* s2end_mod16 = s2 + (w&~15);
840 BYTE* s2end = s2 + w;
841 WORD* d2 = reinterpret_cast<WORD*>(d);
842 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
844 //if(s2[3] < 0xff)
846 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
847 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
852 // // equivalent:
853 // // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa2)&15)==0
854 // // && (reinterpret_cast<intptr_t>(d2)&7)==0 )
855 // if( ((reinterpret_cast<intptr_t>(ss[0]) | reinterpret_cast<intptr_t>(ss[1]) |
856 // reinterpret_cast<intptr_t>(ddUV) |
857 // reinterpret_cast<intptr_t>(src_origin) | static_cast<intptr_t>(src.pitch) |
858 // (static_cast<intptr_t>(dst.pitchUV)&7) ) & 15 )==0 )
859 // {
860 // BYTE* s_u = ss[0];
861 // BYTE* s_v = ss[1];
862 // BYTE* sa = src_origin;
863 // BYTE* d = ddUV;
864 // int pitch = src.pitch;
865 // for(int j = 0; j < h2; j++, s_u += src.pitch*2, s_v += src.pitch*2, sa += src.pitch*2, d += dst.pitchUV)
866 // {
867 // BYTE* s_u2 = s_u;
868 // BYTE* sa2 = sa;
869 // BYTE* s_u2end_mod16 = s_u2 + (w&~15);
870 // BYTE* s_u2end = s_u2 + w;
871 // BYTE* d2 = d;
872 // BYTE* s_v2 = s_v;
874 // for(; s_u2 < s_u2end_mod16; s_u2 += 8, s_v2+=8, sa2 += 8, d2+=16)
875 // {
876 // __m128i dst = _mm_load_si128( reinterpret_cast<const __m128i*>(d2) );
877 // __m128i alpha1 = _mm_load_si128( reinterpret_cast<const __m128i*>(sa2) );
878 // __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(sa2+src.pitch) );
880 // __m128i temp1 = _mm_setzero_si128();
881 // temp1 = _mm_unpacklo_epi8(alpha1, temp1);
882 // __m128i temp2 = _mm_setzero_si128();
883 // temp2 = _mm_unpacklo_epi8(alpha1, temp2);
885 // temp1 = _mm_adds_epu16(temp1, temp2);
887 // temp2 = _mm_srai_epi32(temp1, 16);
888 // temp1 = _mm_adds_epu16(temp1, temp2);
889 // temp1 = _mm_srli_epi32(temp1, 22);
890 // temp2 = _mm_srai_epi32(temp1, 16);
891 // temp1 = _mm_adds_epu16(temp1, temp2);
893 // dst = _mm_mulhi_epu16(dst, temp1);
896 // __m128i su1 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2) );
897 // __m128i su2 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2+src.pitch) );
898 // __m128i sv1 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_v2) );
899 // __m128i sv2 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2+src.pitch) );
901 ///*
902 // su1 = _mm_unpacklo_epi8(su1, zero);
903 // su2 = _mm_unpacklo_epi8(su2, zero);
904 // sv1 = _mm_unpacklo_epi8(sv1, zero);
905 // sv2 = _mm_unpacklo_epi8(sv2, zero);
906 // alpha = _mm_unpacklo_epi8(alpha, zero);
907 // alpha2 = _mm_unpacklo_epi8(alpha2, zero);
909 // su1 = _mm_adds_epu16(su1, su2);
910 // sv1 = _mm_adds_epu16(sv1, sv2);
911 // alpha = _mm_adds_epu16(alpha, alpha2);
913 // su2 = _mm_srli_epi32(su1, 16);
914 // sv2 = _mm_srli_epi32(sv1, 16);
915 // alpha2 = _mm_srli_epi32(alpha, 16);
917 // su1 = _mm_adds_epu16(su1,su2);
918 // sv1 = _mm_adds_epu16(sv1,sv2);
919 // alpha = _mm_adds_epu16(alpha,alpha2);
921 // su1 = _mm_srai_epi32(su1, 16);
922 // sv1 = _mm_srai_epi32(sv1, 16);
923 // sv1 = _mm_srli_epi32(sv1, 16);
925 // su1 = _mm_add_epi32(su1,sv1);
927 // alpha2 = _mm_srai_epi32(alpha, 16);
928 // alpha = _mm_srli_epi32(alpha2, 16);
929 // alpha = _mm_add_epi32(alpha,alpha2);
930 // alpha = _mm_srli_epi16(alpha, 6);
932 // dst = _mm_mulhi_epu16(dst, alpha);
933 // dst = _mm_adds_epu16(dst, su1);
934 // _mm_store_si128( reinterpret_cast<__m128i*>(d2), dst ); */
935 // }
936 // for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, s_v2+=2, sa2+=2, d3++)
937 // {
938 // unsigned int ia = ( sa2[0]+ sa2[1]+
939 // sa2[0+src.pitch]+sa2[1+src.pitch]);
940 // *d3 = (((*d3)*ia)>>8) + ((s_u2[0] + s_u2[1]+
941 // s_u2[src.pitch]+s_u2[1+src.pitch] ));
942 // d3++;
943 // *d3 = (((*d3)*ia)>>8) + ((s_v2[0] + s_v2[1]+
944 // s_v2[src.pitch]+s_v2[1+src.pitch] ));
945 // }
946 // }
947 // }
948 // else//fix me: only a workaround for non-mod-16 size video
950 BYTE* s_u = ss[0];
951 BYTE* s_v = ss[1];
952 BYTE* sa = src_origin;
953 BYTE* d = ddUV;
954 int pitch = src.pitch;
955 for(int j = 0; j < h2; j++, s_u += src.pitch*2, s_v += src.pitch*2, sa += src.pitch*2, d += dst.pitchUV)
957 BYTE* s_u2 = s_u;
958 BYTE* sa2 = sa;
959 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
960 BYTE* s_u2end = s_u2 + w;
961 BYTE* d2 = d;
962 BYTE* s_v2 = s_v;
964 for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, s_v2+=2, sa2+=2, d3++)
966 unsigned int ia = (
967 sa2[0]+ sa2[1]+
968 sa2[0+src.pitch]+sa2[1+src.pitch]);
969 *d3 = (((*d3)*ia)>>10) + ((
970 s_u2[0] + s_u2[1]+
971 s_u2[src.pitch]+s_u2[1+src.pitch] )<<6);
972 d3++;
973 *d3 = (((*d3)*ia)>>10) + ((
974 s_v2[0] + s_v2[1]+
975 s_v2[src.pitch]+s_v2[1+src.pitch] )<<6);
979 __asm emms;
981 break;
982 default:
983 return E_NOTIMPL;
984 break;
987 //emmsÒª40¸öcpuÖÜÆÚ
988 //__asm emms;
989 return S_OK;
992 STDMETHODIMP CMemSubPic::AlphaBltAyuv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
994 const SubPicDesc& src = m_spd;
995 SubPicDesc dst = *pTarget; // copy, because we might modify it
997 CRect rs(*pSrc), rd(*pDst);
999 if(dst.h < 0) {
1000 dst.h = -dst.h;
1001 rd.bottom = dst.h - rd.bottom;
1002 rd.top = dst.h - rd.top;
1005 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1006 return E_INVALIDARG;
1009 int w = rs.Width(), h = rs.Height();
1011 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1012 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1014 if(rd.top > rd.bottom) {
1015 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1017 dst.pitch = -dst.pitch;
1020 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1021 BYTE* s2 = s;
1022 BYTE* s2end = s2 + w*4;
1023 BYTE* d2 = d;
1024 for(; s2 < s2end; s2 += 4, d2++) {
1025 if(s2[3] < 0xff) {
1026 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1030 dst.pitch = abs(dst.pitch);
1032 int h2 = h/2;
1034 if(!dst.pitchUV) {
1035 dst.pitchUV = dst.pitch/2;
1038 BYTE* ss[2];
1039 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1040 ss[1] = ss[0] + 4;
1042 if(!dst.bitsU || !dst.bitsV) {
1043 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1044 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1046 if(dst.type == MSP_YV12) {
1047 BYTE* p = dst.bitsU;
1048 dst.bitsU = dst.bitsV;
1049 dst.bitsV = p;
1053 BYTE* dd[2];
1054 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1055 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1057 if(rd.top > rd.bottom) {
1058 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1059 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1060 dst.pitchUV = -dst.pitchUV;
1063 for(ptrdiff_t i = 0; i < 2; i++) {
1064 s = ss[i];
1065 d = dd[i];
1066 BYTE* is = ss[1-i];
1067 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1068 BYTE* s2 = s;
1069 BYTE* s2end = s2 + w*4;
1070 BYTE* d2 = d;
1071 BYTE* is2 = is;
1072 for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
1073 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1074 if(ia < 0xff) {
1075 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1081 return S_OK;
1084 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1086 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1087 if(dirtyRectList!=NULL)
1089 POSITION pos = dirtyRectList->GetHeadPosition();
1090 if(m_spd.type == MSP_AY11 || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1091 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016)
1093 while(pos!=NULL)
1095 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1096 cRectSrc.left &= ~15;
1097 cRectSrc.right = (cRectSrc.right+15)&~15;
1098 cRectSrc.top &= ~1;
1099 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1102 else if(m_spd.type == MSP_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1104 while(pos!=NULL)
1106 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1107 cRectSrc.left &= ~3;
1108 cRectSrc.right = (cRectSrc.right+3)&~3;
1112 return __super::SetDirtyRectEx(dirtyRectList);
1116 // CMemSubPicAllocator
1119 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1120 : CSubPicExAllocatorImpl(maxsize, false, false)
1121 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1122 , m_maxsize(maxsize)
1123 , m_type(type)
1125 if(m_type==-1)
1127 switch(alpha_blt_dst_type)
1129 case MSP_YUY2:
1130 m_type = MSP_AUYV;
1131 break;
1132 case MSP_AYUV:
1133 m_type = MSP_AYUV;
1134 break;
1135 case MSP_IYUV:
1136 case MSP_YV12:
1137 m_type = MSP_AY11;
1138 break;
1139 default:
1140 m_type = MSP_RGBA;
1141 break;
1146 // ISubPicAllocatorImpl
1148 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1150 if(!ppSubPic) {
1151 return false;
1153 SubPicDesc spd;
1154 spd.w = m_maxsize.cx;
1155 spd.h = m_maxsize.cy;
1156 spd.bpp = 32;
1157 spd.pitch = (spd.w*spd.bpp)>>3;
1158 spd.type = m_type;
1159 spd.bits = DNew BYTE[spd.pitch*spd.h];
1160 if(!spd.bits) {
1161 return false;
1163 *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1164 if(!(*ppSubPic)) {
1165 return false;
1167 (*ppSubPic)->AddRef();
1168 return true;