2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
// SIMD helper macros used by the alpha-blend routines further down.
// NOTE(review): several continuation lines of these macros are not visible
// here, so the summaries below cover only the visible instructions.
//
// MIX_4_PIX_YV12(dst, zero_128i, c_128i, a_128i)
//   Loads the 32-bit value at *dst into an __m128i, applies _MIX_4_PIX_YV12
//   to it and writes the low 32 bits of the result back to *dst.
// _MIX_4_PIX_YV12(dst_128i, zero_128i, c_128i, a_128i)
//   Widens the destination bytes against zero_128i, interleaves the 16-bit
//   lanes with c_128i, multiply-accumulates the pairs with a_128i
//   (_mm_madd_epi16), shifts each 32-bit sum right by 8 and packs the result
//   back down (signed 32->16, then unsigned-saturating 16->8).
// AVERAGE_4_PIX(a,b)
//   Body not visible; SSE2_ALPHA_BLT_UV applies it to XMM1/XMM2.
// SSE2_ALPHA_BLT_UV(dst, alpha_mask, src, src_pitch)
//   MSVC inline-assembly sequence. The visible part loads two 16-byte blocks
//   with movaps (which requires 16-byte aligned addresses), combines them
//   with AVERAGE_4_PIX, multiplies the zero-extended destination bytes by
//   the result (pmullw), adds a second averaged block (paddw) and packs the
//   words to bytes. It uses eax/esi/edi and XMM0-XMM3/MM0.
//   NOTE(review): presumably the first averaged block is alpha and the
//   second is colour - confirm against the full macro.
26 #define MIX_4_PIX_YV12(dst, zero_128i, c_128i, a_128i) \
28 __m128i d_128i = _mm_cvtsi32_si128(*dst); \
29 _MIX_4_PIX_YV12(d_128i, zero_128i, c_128i, a_128i) \
30 *dst = (DWORD)_mm_cvtsi128_si32(d_128i); \
33 #define _MIX_4_PIX_YV12(dst_128i, zero_128i, c_128i, a_128i) \
35 dst_128i = _mm_unpacklo_epi8(dst_128i, zero_128i); \
36 dst_128i = _mm_unpacklo_epi16(dst_128i, c_128i); \
37 dst_128i = _mm_madd_epi16(dst_128i, a_128i); \
38 dst_128i = _mm_srli_epi32(dst_128i, 8); \
39 dst_128i = _mm_packs_epi32(dst_128i, dst_128i); \
40 dst_128i = _mm_packus_epi16(dst_128i, dst_128i); \
43 #define AVERAGE_4_PIX(a,b) \
51 #define SSE2_ALPHA_BLT_UV(dst, alpha_mask, src, src_pitch) \
52 __asm mov eax,src_pitch \
54 __asm xorps XMM0,XMM0 \
55 __asm mov esi, alpha_mask \
56 __asm movaps XMM1,[esi] \
58 __asm movaps XMM2,[esi] \
60 __asm AVERAGE_4_PIX(XMM1, XMM2) \
62 __asm movlps XMM3,[edi] \
63 __asm punpcklbw XMM3,XMM0 \
64 __asm pmullw XMM3,XMM1 \
68 __asm movaps XMM1,[esi] \
70 __asm movaps XMM2,[esi] \
71 __asm AVERAGE_4_PIX(XMM1, XMM2) \
73 __asm paddw XMM3,XMM1 \
74 __asm packuswb XMM3,XMM0 \
76 __asm movdq2q MM0, XMM3 \
83 CMemSubPic::CMemSubPic(SubPicDesc
& spd
, int alpha_blt_dst_type
)
84 : m_spd(spd
), m_alpha_blt_dst_type(alpha_blt_dst_type
)
86 m_maxsize
.SetSize(spd
.w
, spd
.h
);
87 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
88 CRect
allSpd(0,0,spd
.w
, spd
.h
);
89 m_rectListDirty
.AddTail(allSpd
);
92 CMemSubPic::~CMemSubPic()
94 delete [] m_spd
.bits
, m_spd
.bits
= NULL
;
99 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
101 return (void*)&m_spd
;
// Fills the caller's descriptor `spd` from this sub-picture: type, pitch and
// the bits/bitsU/bitsV pointers are copied from m_spd, and vidrect is taken
// from m_vidrect. The buffer pointers are shared with the caller, not copied.
// NOTE(review): further fields (e.g. width/height/bpp) are presumably
// assigned between `type` and `pitch`, and the return value is not visible
// here - confirm against the complete function.
104 STDMETHODIMP
CMemSubPic::GetDesc(SubPicDesc
& spd
) const
106 spd
.type
= m_spd
.type
;
110 spd
.pitch
= m_spd
.pitch
;
111 spd
.bits
= m_spd
.bits
;
112 spd
.bitsU
= m_spd
.bitsU
;
113 spd
.bitsV
= m_spd
.bitsV
;
114 spd
.vidrect
= m_vidrect
;
// Copies this sub-picture into pSubPic.
// The base-class CopyTo runs first and its failure is checked; then both
// surface descriptors are fetched through GetDesc. For each dirty rectangle
// the source and destination start addresses are computed assuming 4 bytes
// per pixel, and the rows are walked with each surface's own pitch.
// NOTE(review): the per-row copy statement is not visible here.
// NOTE(review): the outer loop tests !IsEmpty() but reads the entry with
// GetHead(), which does not remove it - confirm that the list is shrunk
// somewhere in the loop, otherwise it cannot terminate.
118 STDMETHODIMP
CMemSubPic::CopyTo(ISubPicEx
* pSubPic
)
121 if(FAILED(hr
= __super::CopyTo(pSubPic
))) {
126 if(FAILED(GetDesc(src
)) || FAILED(pSubPic
->GetDesc(dst
))) {
129 while(!m_rectListDirty
.IsEmpty())
131 CRect
& cRect
= m_rectListDirty
.GetHead();
132 int w
= cRect
.Width(), h
= cRect
.Height();
133 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*cRect
.top
+ cRect
.left
*4;
134 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*cRect
.top
+ cRect
.left
*4;
135 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
// Clears every rectangle in the dirty list and leaves the list empty.
// Rectangles are taken from the tail one at a time. `p` points at the first
// byte of the rectangle, using m_spd.bpp/8 bytes per pixel horizontally.
//  - Surfaces other than MSP_AY11: each row is filled with `color` through
//    memsetd, with a byte count of w*4.
//  - MSP_AY11: the visible memset calls zero `w` bytes per row at offsets of
//    1x, 2x and 3x (m_spd.h * m_spd.pitch) from the row start, i.e. in three
//    further regions of the buffer, each one full surface apart.
//    NOTE(review): presumably these are the Y/U/V planes following an alpha
//    plane whose fill is not visible here - confirm.
// NOTE(review): the handling of an already empty list is not visible here.
141 STDMETHODIMP
CMemSubPic::ClearDirtyRect(DWORD color
)
143 if(m_rectListDirty
.IsEmpty()) {
146 while(!m_rectListDirty
.IsEmpty())
148 //pDirtyRect = m_rectListDirty.RemoveHead();
149 CRect
& dirtyRect
= m_rectListDirty
.RemoveTail();
150 BYTE
* p
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*(dirtyRect
.top
) + dirtyRect
.left
*(m_spd
.bpp
>>3);
151 int w
= dirtyRect
.Width();
152 if(m_spd
.type
!=MSP_AY11
)
154 for(int j
= 0, h
= dirtyRect
.Height(); j
< h
; j
++, p
+= m_spd
.pitch
)
157 memsetd(p
, color
, w
*4); // nya
175 for(int j
= 0, h
= dirtyRect
.Height(); j
< h
; j
++, p
+= m_spd
.pitch
)
177 // memsetd(p, 0, m_rcDirty.Width());
178 //DbgLog((LOG_TRACE, 3, "w:%d", w));
179 //w = pDirtyRect->Width();
181 memset(p
+m_spd
.h
*m_spd
.pitch
, 0, w
);
182 memset(p
+m_spd
.h
*m_spd
.pitch
*2, 0, w
);
183 memset(p
+m_spd
.h
*m_spd
.pitch
*3, 0, w
);
187 m_rectListDirty
.RemoveAll();
// Lock entry point taking the caller's descriptor by reference.
// NOTE(review): the body is not visible here; presumably it reports the
// surface description through `spd` - confirm against the full source.
191 STDMETHODIMP
CMemSubPic::Lock(SubPicDesc
& spd
)
// Chooses the unlock routine from the pair (source format m_spd.type,
// blend-target format m_alpha_blt_dst_type):
//  - RGBA -> RGB32/RGB24/RGB16/RGB15, AUYV -> YUY2, AYUV -> AYUV and
//    AY11 -> IYUV/YV12/P010/P016 are handled by UnlockOther();
//  - RGBA -> YUY2/AYUV/IYUV/YV12 is handled by UnlockRGBA_YUV().
// NOTE(review): the result for any other combination is not visible here.
196 STDMETHODIMP
CMemSubPic::Unlock( CAtlList
<CRect
>* dirtyRectList
)
198 int src_type
= m_spd
.type
;
199 int dst_type
= m_alpha_blt_dst_type
;
200 if( (src_type
==MSP_RGBA
&& (dst_type
== MSP_RGB32
||
201 dst_type
== MSP_RGB24
||
202 dst_type
== MSP_RGB16
||
203 dst_type
== MSP_RGB15
))
205 (src_type
==MSP_AUYV
&& dst_type
== MSP_YUY2
)//ToDo: fix me MSP_AYUV
207 (src_type
==MSP_AYUV
&& dst_type
== MSP_AYUV
)
209 (src_type
==MSP_AY11
&& (dst_type
== MSP_IYUV
||
210 dst_type
== MSP_YV12
||
211 dst_type
== MSP_P010
||
212 dst_type
== MSP_P016
)))
214 return UnlockOther(dirtyRectList
);
216 else if(src_type
==MSP_RGBA
&& (dst_type
== MSP_YUY2
||
217 dst_type
== MSP_AYUV
|| //ToDo: fix me MSP_AYUV
218 dst_type
== MSP_IYUV
||
219 dst_type
== MSP_YV12
))
221 return UnlockRGBA_YUV(dirtyRectList
);
// Registers the dirty rectangles through SetDirtyRectEx(), then rewrites
// the pixels of each rectangle in place so that they match the layout
// expected by the blend target (m_alpha_blt_dst_type). Pixels are addressed
// as 4 bytes each; `top`/`bottom` bound the rows of the rectangle.
// NOTE(review): the inner per-pixel loop headers are not visible here.
226 STDMETHODIMP
CMemSubPic::UnlockOther(CAtlList
<CRect
>* dirtyRectList
)
228 SetDirtyRectEx(dirtyRectList
);
229 if(m_rectListDirty
.IsEmpty()) {
233 POSITION pos
= m_rectListDirty
.GetHeadPosition();
236 const CRect
& cRect
= m_rectListDirty
.GetNext(pos
);
237 int w
= cRect
.Width(), h
= cRect
.Height();
238 BYTE
* top
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*(cRect
.top
) + cRect
.left
*4;
239 BYTE
* bottom
= top
+ m_spd
.pitch
*h
;
// RGB16 target: colour is packed as 5-6-5 into the low 16 bits of the
// DWORD; the alpha byte is shifted right by 3 and kept in bits 24-28.
240 if(m_alpha_blt_dst_type
== MSP_RGB16
)
242 for(; top
< bottom
; top
+= m_spd
.pitch
)
244 DWORD
* s
= (DWORD
*)top
;
248 *s
= ((*s
>>3)&0x1f000000)|((*s
>>8)&0xf800)|((*s
>>5)&0x07e0)|((*s
>>3)&0x001f);
249 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
// RGB15 target: same scheme with a 5-5-5 colour layout.
253 else if(m_alpha_blt_dst_type
== MSP_RGB15
)
255 for(; top
< bottom
; top
+= m_spd
.pitch
)
257 DWORD
* s
= (DWORD
*)top
;
261 *s
= ((*s
>>3)&0x1f000000)|((*s
>>9)&0x7c00)|((*s
>>6)&0x03e0)|((*s
>>3)&0x001f);
262 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
// YUY2 target: pixels are processed in pairs (8 bytes). Byte 4 becomes the
// mean of bytes 0 and 4, then byte 0 becomes the mean of bytes 2 and 6, so
// each pair ends up sharing one averaged chroma sample of each kind.
// The XY_DO_ONCE calls dump the buffer to a hard-coded path before/after.
266 else if(m_alpha_blt_dst_type
== MSP_YUY2
)
268 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top
, m_spd
.pitch
*(h
-1)) );
270 for(BYTE
* tempTop
=top
; tempTop
< bottom
; tempTop
+= m_spd
.pitch
)
274 for(; s
< e
; s
+=8) // AUYV AUYV -> AxYU AxYV
276 s
[4] = (s
[0] + s
[4])>>1;
277 s
[0] = (s
[2] + s
[6])>>1;
281 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top
, m_spd
.pitch
*(h
-1)) );
// Planar / P01x targets. MSP_YV12 is tested twice in this condition; the
// second test is redundant. NOTE(review): branch body not visible here.
283 else if(m_alpha_blt_dst_type
== MSP_YV12
|| m_alpha_blt_dst_type
== MSP_IYUV
|| m_alpha_blt_dst_type
== MSP_YV12
284 || m_alpha_blt_dst_type
== MSP_P010
|| m_alpha_blt_dst_type
== MSP_P016
)
// Registers the dirty rectangles through SetDirtyRectEx(), then converts
// the 4-byte RGBA pixels inside each rectangle to YUV in place, using the
// lookup tables and coefficients of the default ColorConvTable.
//
// In both branches `a` is derived from (0x100 - alpha) and scales the
// constant offsets that are added to luma (0x10*a) and chroma (0x80*a).
//  - YUY2 / YV12 / IYUV target: two pixels (8 bytes) at a time, and only
//    when s[3]+s[7] < 0x1fe. Luma goes to s[1] and s[5]; one chroma value
//    derived from s[0]+s[4] goes to s[0] and one derived from s[2]+s[6]
//    goes to s[4] ("ARGB ARGB -> AxYU AxYV").
//  - AYUV target: one pixel at a time; chroma derived from s[0] goes to
//    s[1] and chroma derived from s[2] goes to s[0].
//    NOTE(review): the store of `y` and the alpha test are not visible here.
// Chroma results are range-limited through the Clip table.
292 STDMETHODIMP
CMemSubPic::UnlockRGBA_YUV(CAtlList
<CRect
>* dirtyRectList
)
294 SetDirtyRectEx(dirtyRectList
);
295 if(m_rectListDirty
.IsEmpty()) {
299 const ColorConvTable
*conv_table
= ColorConvTable::GetDefaultColorConvTable();
300 const int *c2y_yb
= conv_table
->c2y_yb
;
301 const int *c2y_yg
= conv_table
->c2y_yg
;
302 const int *c2y_yr
= conv_table
->c2y_yr
;
303 const int cy_cy2
= conv_table
->cy_cy2
;
304 const int c2y_cu
= conv_table
->c2y_cu
;
305 const int c2y_cv
= conv_table
->c2y_cv
;
306 const int cy_cy
= conv_table
->cy_cy
;
307 const unsigned char* Clip
= conv_table
->Clip
;
309 POSITION pos
= m_rectListDirty
.GetHeadPosition();
312 const CRect
& cRect
= m_rectListDirty
.GetNext(pos
);
313 int w
= cRect
.Width(), h
= cRect
.Height();
315 BYTE
* top
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*cRect
.top
+ cRect
.left
*4;
316 BYTE
* bottom
= top
+ m_spd
.pitch
*h
;
318 if(m_alpha_blt_dst_type
== MSP_YUY2
|| m_alpha_blt_dst_type
== MSP_YV12
|| m_alpha_blt_dst_type
== MSP_IYUV
) {
319 for(; top
< bottom
; top
+= m_spd
.pitch
) {
322 for(; s
< e
; s
+=8) { // ARGB ARGB -> AxYU AxYV
323 if((s
[3]+s
[7]) < 0x1fe) {
324 int a
= 0x200 - (s
[3]+s
[7]);
327 s
[1] = (c2y_yb
[s
[0]] + c2y_yg
[s
[1]] + c2y_yr
[s
[2]] + 0x10*a
+ 0x8000) >> 16;
328 s
[5] = (c2y_yb
[s
[4]] + c2y_yg
[s
[5]] + c2y_yr
[s
[6]] + 0x10*a
+ 0x8000) >> 16;
330 int scaled_y
= (s
[1]+s
[5]-32) * cy_cy2
;
332 s
[0] = Clip
[(((((s
[0]+s
[4])<<15) - scaled_y
) >> 10) * c2y_cu
+ 0x80*a
+ 0x8000) >> 16];
333 s
[4] = Clip
[(((((s
[2]+s
[6])<<15) - scaled_y
) >> 10) * c2y_cv
+ 0x80*a
+ 0x8000) >> 16];
341 else if(m_alpha_blt_dst_type
== MSP_AYUV
) {
342 for(; top
< bottom
; top
+= m_spd
.pitch
) {
345 for(; s
< e
; s
+=4) { // ARGB -> AYUV
347 int a
= 0x100 - s
[3];
351 int y
= (c2y_yb
[s
[0]] + c2y_yg
[s
[1]] + c2y_yr
[s
[2]] + 0x10*a
+ 0x8000) >> 16;
352 int scaled_y
= (y
-32) * cy_cy
;
353 s
[1] = Clip
[((((s
[0]<<16) - scaled_y
) >> 10) * c2y_cu
+ 0x80*a
+ 0x8000) >> 16];
354 s
[0] = Clip
[((((s
[2]<<16) - scaled_y
) >> 10) * c2y_cv
+ 0x80*a
+ 0x8000) >> 16];
// Blend entry point. Checks that pSrc, pDst and pTarget are non-null, then
// chooses the blend routine from (m_spd.type, pTarget->type):
//  - RGBA -> RGB32/RGB24/RGB16/RGB15/RGBA/YUY2/AYUV, AUYV -> YUY2,
//    AYUV -> AYUV and AY11 -> IYUV/YV12/P010/P016 go to AlphaBltOther();
//  - RGBA -> IYUV/YV12 goes to AlphaBltAyuv_Yv12().
// NOTE(review): the values returned for null arguments and for unsupported
// combinations are not visible here.
367 STDMETHODIMP
CMemSubPic::AlphaBlt( const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
369 if(!pSrc
|| !pDst
|| !pTarget
) {
372 int src_type
= m_spd
.type
;
373 int dst_type
= pTarget
->type
;
375 if( (src_type
==MSP_RGBA
&& (dst_type
== MSP_RGB32
||
376 dst_type
== MSP_RGB24
||
377 dst_type
== MSP_RGB16
||
378 dst_type
== MSP_RGB15
||
379 dst_type
== MSP_RGBA
||
380 dst_type
== MSP_YUY2
||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
381 dst_type
== MSP_AYUV
))
383 (src_type
==MSP_AUYV
&& dst_type
== MSP_YUY2
)//ToDo: fix me MSP_AYUV
385 (src_type
==MSP_AYUV
&& dst_type
== MSP_AYUV
)
387 (src_type
==MSP_AY11
&& (dst_type
== MSP_IYUV
||
388 dst_type
== MSP_YV12
||
389 dst_type
== MSP_P010
||
390 dst_type
== MSP_P016
)) )
392 return AlphaBltOther(pSrc
, pDst
, pTarget
);
394 else if( src_type
==MSP_RGBA
&& (dst_type
== MSP_IYUV
||
395 dst_type
== MSP_YV12
))
397 return AlphaBltAyuv_Yv12(pSrc
, pDst
, pTarget
);
// Blends the region *pSrc of this sub-picture onto the region *pDst of
// *pTarget. The target descriptor is copied into `dst` first because pitch
// and chroma pointers are modified locally. Source and destination
// rectangles are compared for equal width and (absolute) height.
// In every visible path the source alpha byte is used as the weight of the
// destination ("dst*alpha>>8 + src"), i.e. it is treated as an inverted,
// pre-multiplied alpha.
// NOTE(review): the switch/case labels, the inline-asm bodies and many local
// declarations are not visible here; the section notes below are derived
// from the visible arithmetic only and should be confirmed.
402 STDMETHODIMP
CMemSubPic::AlphaBltOther(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
404 const SubPicDesc
& src
= m_spd
;
405 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
407 CRect
rs(*pSrc
), rd(*pDst
);
// rd is mirrored vertically against dst.h.
// NOTE(review): presumably only for bottom-up targets - guard not visible.
411 rd
.bottom
= dst
.h
- rd
.bottom
;
412 rd
.top
= dst
.h
- rd
.top
;
414 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
417 int w
= rs
.Width(), h
= rs
.Height();
418 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);//rs.left*4
419 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*rd
.top
+ ((rd
.left
*dst
.bpp
)>>3);
// Mirrored destination: start at row rd.top-1 and walk rows with a negated
// pitch. Planar YV12/IYUV use one byte per luma sample for the x offset.
420 if(rd
.top
> rd
.bottom
)
422 if(dst
.type
== MSP_RGB32
|| dst
.type
== MSP_RGB24
423 || dst
.type
== MSP_RGB16
|| dst
.type
== MSP_RGB15
424 || dst
.type
== MSP_YUY2
|| dst
.type
== MSP_AYUV
)
426 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + (rd
.left
*dst
.bpp
>>3);
428 else if(dst
.type
== MSP_YV12
|| dst
.type
== MSP_IYUV
)
430 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + (rd
.left
*8>>3);
436 dst
.pitch
= -dst
.pitch
;
438 DbgLog((LOG_TRACE
, 5, TEXT("w=%d h=%d"), w
, h
));
// 32-bit path that divides each colour channel by (0x100 - alpha).
// NOTE(review): looks like un-premultiplying into an RGBA target - confirm.
442 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
445 BYTE
* s2end
= s2
+ w
*4;
446 DWORD
* d2
= (DWORD
*)d
;
447 for(; s2
< s2end
; s2
+= 4, d2
++)
451 DWORD bd
=0x00000100 -( (DWORD
) s2
[3]);
452 DWORD B
= ((*((DWORD
*)s2
)&0x000000ff)<<8)/bd
;
453 DWORD V
= ((*((DWORD
*)s2
)&0x0000ff00)/bd
)<<8;
454 DWORD R
= (((*((DWORD
*)s2
)&0x00ff0000)>>8)/bd
)<<16;
456 | (0xff000000-(*((DWORD
*)s2
)&0xff000000))&0xff000000;
462 case MSP_AYUV
: //ToDo: fix me MSP_VUYA indeed?
463 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
466 BYTE
* s2end
= s2
+ w
*4;
467 DWORD
* d2
= (DWORD
*)d
;
468 for(; s2
< s2end
; s2
+= 4, d2
++)
471 DWORD ia
= 256-s2
[3];
473 *d2
= ((((*d2
&0x00ff00ff)*s2
[3])>>8) + (((*((DWORD
*)s2
)&0x00ff00ff)*ia
)>>8)&0x00ff00ff)
474 | ((((*d2
&0x0000ff00)*s2
[3])>>8) + (((*((DWORD
*)s2
)&0x0000ff00)*ia
)>>8)&0x0000ff00);
479 *d2
= (((((*d2
&0x00ff00ff)*s2
[3])>>8) + (*((DWORD
*)s2
)&0x00ff00ff))&0x00ff00ff)
480 | (((((*d2
&0x0000ff00)*s2
[3])>>8) + (*((DWORD
*)s2
)&0x0000ff00))&0x0000ff00);
// Path with 3 bytes per destination pixel (d2 advances by 3).
487 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
490 BYTE
* s2end
= s2
+ w
*4;
492 for(; s2
< s2end
; s2
+= 4, d2
+= 3)
496 d2
[0] = ((d2
[0]*s2
[3])>>8) + s2
[0];
497 d2
[1] = ((d2
[1]*s2
[3])>>8) + s2
[1];
498 d2
[2] = ((d2
[2]*s2
[3])>>8) + s2
[2];
// 16-bit path with 5-6-5 masks (0xf81f / 0x07e0); alpha is applied with >>5.
504 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
507 BYTE
* s2end
= s2
+ w
*4;
509 for(; s2
< s2end
; s2
+= 4, d2
++)
513 *d2
= (WORD
)((((((*d2
&0xf81f)*s2
[3])>>5) + (*(DWORD
*)s2
&0xf81f))&0xf81f)
514 | (((((*d2
&0x07e0)*s2
[3])>>5) + (*(DWORD
*)s2
&0x07e0))&0x07e0));
515 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
516 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
517 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
524 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
527 BYTE
* s2end
= s2
+ w
*4;
529 for(; s2
< s2end
; s2
+= 4, d2
++)
533 *d2
= (WORD
)((((((*d2
&0x7c1f)*s2
[3])>>5) + (*(DWORD
*)s2
&0x7c1f))&0x7c1f)
534 | (((((*d2
&0x03e0)*s2
[3])>>5) + (*(DWORD
*)s2
&0x03e0))&0x03e0));
535 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
536 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
537 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
544 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
548 BYTE
* s2end
= s2
+ w
*4;
549 DWORD
* d2
= (DWORD
*)d
;
550 for(; s2
< s2end
; s2
+= 8, d2
++)
552 ia
= (s2
[3]+s2
[7])>>1;
555 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
556 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
557 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
558 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
559 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
561 ia
= (ia
<<24)|(s2
[7]<<16)|(ia
<<8)|s2
[3];
562 c
= (s2
[4]<<24)|(s2
[5]<<16)|(s2
[0]<<8)|s2
[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
573 psraw mm4
, 1 //or else, overflow because psraw shift in sign bit
588 //dst.pitch = abs(dst.pitch);
592 dst
.pitchUV
= abs(dst
.pitch
)/2;
594 if(!dst
.bitsU
|| !dst
.bitsV
)
596 dst
.bitsU
= (BYTE
*)dst
.bits
+ abs(dst
.pitch
)*dst
.h
;
597 dst
.bitsV
= dst
.bitsU
+ dst
.pitchUV
*dst
.h
/2;
598 if(dst
.type
== MSP_YV12
)
601 dst
.bitsU
= dst
.bitsV
;
606 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
607 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
608 if(rd
.top
> rd
.bottom
)
610 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
611 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
612 dst
.pitchUV
= -dst
.pitchUV
;
615 BYTE
* src_origin
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
;
616 BYTE
*s
= src_origin
;
619 ss
[0] = src_origin
+ src
.pitch
*src
.h
*2;//U
620 ss
[1] = src_origin
+ src
.pitch
*src
.h
*3;//V
623 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
624 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
625 if( ((reinterpret_cast<intptr_t>(s
) | static_cast<intptr_t>(src
.pitch
) |
626 reinterpret_cast<intptr_t>(d
) | static_cast<intptr_t>(dst
.pitch
) ) & 15 )==0 )
628 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
631 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
632 BYTE
* s2end_mod16
= s2
+ (w
&~15);
633 BYTE
* s2end
= s2
+ w
;
636 for(; s2
< s2end_mod16
; s2
+=16, sa
+=16, d2
+=16)
667 for(; s2
< s2end
; s2
++, sa
++, d2
++)
669 d2
[0] = (((d2
[0])*sa
[0])>>8) + s2
[0];
673 else //fix me: only a workaround for non-mod-16 size video
675 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
678 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
679 BYTE
* s2end_mod16
= s2
+ (w
&~15);
680 BYTE
* s2end
= s2
+ w
;
682 for(; s2
< s2end
; s2
+=1, sa
+=1, d2
+=1)
686 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
687 d2
[0] = (((d2
[0])*sa
[0])>>8) + s2
[0];
693 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa2)&15)==0
694 // && (reinterpret_cast<intptr_t>(d2)&7)==0 )
695 if( ((reinterpret_cast<intptr_t>(ss
[0]) | reinterpret_cast<intptr_t>(ss
[1]) |
696 reinterpret_cast<intptr_t>(dd
[0]) | reinterpret_cast<intptr_t>(dd
[1]) |
697 reinterpret_cast<intptr_t>(src_origin
) | static_cast<intptr_t>(src
.pitch
) |
698 (static_cast<intptr_t>(dst
.pitchUV
)&7) ) & 15 )==0 )
700 for(int i
= 0; i
< 2; i
++)
703 BYTE
* sa
= src_origin
;
705 int pitch
= src
.pitch
;
706 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
*2, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
710 BYTE
* s2end_mod16
= s2
+ (w
&~15);
711 BYTE
* s2end
= s2
+ w
;
714 for(; s2
< s2end_mod16
; s2
+= 16, sa2
+= 16, d2
+=8)
716 SSE2_ALPHA_BLT_UV(d2
, sa2
, s2
, pitch
)
718 for(; s2
< s2end
; s2
+=2, sa2
+=2, d2
++)
720 unsigned int ia
= (sa2
[0]+ +sa2
[1]+
721 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
])>>2;
722 *d2
= (((*d2
)*ia
)>>8) + ((s2
[0] +s2
[1]+
723 s2
[src
.pitch
]+s2
[1+src
.pitch
] )>>2);
728 else//fix me: only a workaround for non-mod-16 size video
730 for(int i
= 0; i
< 2; i
++)
733 BYTE
* sa
= src_origin
;
735 int pitch
= src
.pitch
;
736 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
*2, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
740 BYTE
* s2end_mod16
= s2
+ (w
&~15);
741 BYTE
* s2end
= s2
+ w
;
743 for(; s2
< s2end
; s2
+= 2, sa2
+= 2, d2
++)
745 unsigned int ia
= (sa2
[0]+ +sa2
[1]+
746 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
])>>2;
749 // *d2 = (((*d2-0x80)*ia)>>8) + ((s2[0] +s2[1]
750 // s2[src.pitch]+s2[1+src.pitch] )>>2);
751 *d2
= (((*d2
)*ia
)>>8) + ((s2
[0] +s2
[1]+
752 s2
[src
.pitch
]+s2
[1+src
.pitch
] )>>2);
764 //dst.pitch = abs(dst.pitch);
768 dst
.pitchUV
= abs(dst
.pitch
);
770 if(!dst
.bitsU
|| !dst
.bitsV
)
772 dst
.bitsU
= (BYTE
*)dst
.bits
+ abs(dst
.pitch
)*dst
.h
;
773 dst
.bitsV
= dst
.bitsU
+ 2;
775 BYTE
* ddUV
= dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
*2;
776 if(rd
.top
> rd
.bottom
)
778 ddUV
= dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
*2;
779 dst
.pitchUV
= -dst
.pitchUV
;
782 BYTE
* src_origin
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
;
783 BYTE
*s
= src_origin
;
786 ss
[0] = src_origin
+ src
.pitch
*src
.h
*2;//U
787 ss
[1] = src_origin
+ src
.pitch
*src
.h
*3;//V
790 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
791 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
792 if( ((reinterpret_cast<intptr_t>(s
) | static_cast<intptr_t>(src
.pitch
) |
793 reinterpret_cast<intptr_t>(d
) | static_cast<intptr_t>(dst
.pitch
) ) & 15 )==0 )
795 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
798 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
799 BYTE
* s2end_mod16
= s2
+ (w
&~15);
800 BYTE
* s2end
= s2
+ w
;
803 for(; s2
< s2end_mod16
; s2
+=16, sa
+=16, d2
+=16)
806 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(sa
) );
807 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(s2
) );
808 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(d2
) );
809 __m128i lo
= _mm_setzero_si128();
810 lo
= _mm_unpacklo_epi8(lo
, alpha
);
811 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
812 lo
= _mm_setzero_si128();
813 lo
= _mm_unpacklo_epi8(lo
, src_y
);
814 dst_y
= _mm_adds_epu16(dst_y
, lo
);
815 _mm_store_si128( reinterpret_cast<__m128i
*>(d2
), dst_y
);
818 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(d2
) );
819 lo
= _mm_setzero_si128();
820 lo
= _mm_unpackhi_epi8(lo
, alpha
);
821 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
822 lo
= _mm_setzero_si128();
823 lo
= _mm_unpackhi_epi8(lo
, src_y
);
824 dst_y
= _mm_adds_epu16(dst_y
, lo
);
825 _mm_store_si128( reinterpret_cast<__m128i
*>(d2
), dst_y
);
// NOTE(review): this tail loop advances d3 but reads and writes d2[0];
// it looks like d3[0] was intended - confirm and fix in the full source.
827 for( WORD
* d3
=reinterpret_cast<WORD
*>(d2
); s2
< s2end
; s2
++, sa
++, d3
++)
829 d2
[0] = ((d2
[0]*sa
[0])>>8) + (s2
[0]<<8);
833 else //fix me: only a workaround for non-mod-16 size video
835 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
838 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
839 BYTE
* s2end_mod16
= s2
+ (w
&~15);
840 BYTE
* s2end
= s2
+ w
;
841 WORD
* d2
= reinterpret_cast<WORD
*>(d
);
842 for(; s2
< s2end
; s2
+=1, sa
+=1, d2
+=1)
846 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
847 d2
[0] = ((d2
[0]*sa
[0])>>8) + (s2
[0]<<8);
853 // // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa2)&15)==0
854 // // && (reinterpret_cast<intptr_t>(d2)&7)==0 )
855 // if( ((reinterpret_cast<intptr_t>(ss[0]) | reinterpret_cast<intptr_t>(ss[1]) |
856 // reinterpret_cast<intptr_t>(ddUV) |
857 // reinterpret_cast<intptr_t>(src_origin) | static_cast<intptr_t>(src.pitch) |
858 // (static_cast<intptr_t>(dst.pitchUV)&7) ) & 15 )==0 )
860 // BYTE* s_u = ss[0];
861 // BYTE* s_v = ss[1];
862 // BYTE* sa = src_origin;
864 // int pitch = src.pitch;
865 // for(int j = 0; j < h2; j++, s_u += src.pitch*2, s_v += src.pitch*2, sa += src.pitch*2, d += dst.pitchUV)
869 // BYTE* s_u2end_mod16 = s_u2 + (w&~15);
870 // BYTE* s_u2end = s_u2 + w;
874 // for(; s_u2 < s_u2end_mod16; s_u2 += 8, s_v2+=8, sa2 += 8, d2+=16)
876 // __m128i dst = _mm_load_si128( reinterpret_cast<const __m128i*>(d2) );
877 // __m128i alpha1 = _mm_load_si128( reinterpret_cast<const __m128i*>(sa2) );
878 // __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(sa2+src.pitch) );
880 // __m128i temp1 = _mm_setzero_si128();
881 // temp1 = _mm_unpacklo_epi8(alpha1, temp1);
882 // __m128i temp2 = _mm_setzero_si128();
883 // temp2 = _mm_unpacklo_epi8(alpha1, temp2);
885 // temp1 = _mm_adds_epu16(temp1, temp2);
887 // temp2 = _mm_srai_epi32(temp1, 16);
888 // temp1 = _mm_adds_epu16(temp1, temp2);
889 // temp1 = _mm_srli_epi32(temp1, 22);
890 // temp2 = _mm_srai_epi32(temp1, 16);
891 // temp1 = _mm_adds_epu16(temp1, temp2);
893 // dst = _mm_mulhi_epu16(dst, temp1);
896 // __m128i su1 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2) );
897 // __m128i su2 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2+src.pitch) );
898 // __m128i sv1 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_v2) );
899 // __m128i sv2 = _mm_load_si128( reinterpret_cast<const __m128i*>(s_u2+src.pitch) );
902 // su1 = _mm_unpacklo_epi8(su1, zero);
903 // su2 = _mm_unpacklo_epi8(su2, zero);
904 // sv1 = _mm_unpacklo_epi8(sv1, zero);
905 // sv2 = _mm_unpacklo_epi8(sv2, zero);
906 // alpha = _mm_unpacklo_epi8(alpha, zero);
907 // alpha2 = _mm_unpacklo_epi8(alpha2, zero);
909 // su1 = _mm_adds_epu16(su1, su2);
910 // sv1 = _mm_adds_epu16(sv1, sv2);
911 // alpha = _mm_adds_epu16(alpha, alpha2);
913 // su2 = _mm_srli_epi32(su1, 16);
914 // sv2 = _mm_srli_epi32(sv1, 16);
915 // alpha2 = _mm_srli_epi32(alpha, 16);
917 // su1 = _mm_adds_epu16(su1,su2);
918 // sv1 = _mm_adds_epu16(sv1,sv2);
919 // alpha = _mm_adds_epu16(alpha,alpha2);
921 // su1 = _mm_srai_epi32(su1, 16);
922 // sv1 = _mm_srai_epi32(sv1, 16);
923 // sv1 = _mm_srli_epi32(sv1, 16);
925 // su1 = _mm_add_epi32(su1,sv1);
927 // alpha2 = _mm_srai_epi32(alpha, 16);
928 // alpha = _mm_srli_epi32(alpha2, 16);
929 // alpha = _mm_add_epi32(alpha,alpha2);
930 // alpha = _mm_srli_epi16(alpha, 6);
932 // dst = _mm_mulhi_epu16(dst, alpha);
933 // dst = _mm_adds_epu16(dst, su1);
934 // _mm_store_si128( reinterpret_cast<__m128i*>(d2), dst ); */
936 // for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, s_v2+=2, sa2+=2, d3++)
938 // unsigned int ia = ( sa2[0]+ sa2[1]+
939 // sa2[0+src.pitch]+sa2[1+src.pitch]);
940 // *d3 = (((*d3)*ia)>>8) + ((s_u2[0] + s_u2[1]+
941 // s_u2[src.pitch]+s_u2[1+src.pitch] ));
943 // *d3 = (((*d3)*ia)>>8) + ((s_v2[0] + s_v2[1]+
944 // s_v2[src.pitch]+s_v2[1+src.pitch] ));
948 // else//fix me: only a workaround for non-mod-16 size video
952 BYTE
* sa
= src_origin
;
954 int pitch
= src
.pitch
;
955 for(int j
= 0; j
< h2
; j
++, s_u
+= src
.pitch
*2, s_v
+= src
.pitch
*2, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
959 BYTE
* s_u2end_mod16
= s_u2
+ (w
&~15);
960 BYTE
* s_u2end
= s_u2
+ w
;
964 for( WORD
* d3
=reinterpret_cast<WORD
*>(d2
); s_u2
< s_u2end
; s_u2
+=2, s_v2
+=2, sa2
+=2, d3
++)
968 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
]);
969 *d3
= (((*d3
)*ia
)>>10) + ((
971 s_u2
[src
.pitch
]+s_u2
[1+src
.pitch
] )<<6);
973 *d3
= (((*d3
)*ia
)>>10) + ((
975 s_v2
[src
.pitch
]+s_v2
[1+src
.pitch
] )<<6);
// Blends this 4-bytes-per-pixel sub-picture onto a planar YV12/IYUV target.
// Works on a local copy of the target descriptor. Returns E_INVALIDARG when
// the source and destination rectangles differ in width or absolute height.
// Luma: every destination byte becomes ((dst * s2[3]) >> 8) + s2[1], so
// source byte 3 weights the destination and source byte 1 is added.
// Chroma: runs twice (i = 0, 1), stepping two source rows and 8 source
// bytes per destination sample. The weight `ia` is the mean of four alpha
// bytes and the added value is the mean of s2[0] on two consecutive rows.
// When bitsU/bitsV are not supplied they are placed after the luma plane;
// the visible code in the MSP_YV12 branch assigns bitsV to bitsU.
// NOTE(review): several declarations and assignments (s2/d2/is/is2, ss[1],
// h2, the rest of the U/V swap, the return) are not visible here.
992 STDMETHODIMP
CMemSubPic::AlphaBltAyuv_Yv12(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
994 const SubPicDesc
& src
= m_spd
;
995 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
997 CRect
rs(*pSrc
), rd(*pDst
);
1001 rd
.bottom
= dst
.h
- rd
.bottom
;
1002 rd
.top
= dst
.h
- rd
.top
;
1005 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1006 return E_INVALIDARG
;
1009 int w
= rs
.Width(), h
= rs
.Height();
1011 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);
1012 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*rd
.top
+ rd
.left
;
1014 if(rd
.top
> rd
.bottom
) {
1015 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + rd
.left
;
1017 dst
.pitch
= -dst
.pitch
;
// Luma pass.
1020 for(ptrdiff_t j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
) {
1022 BYTE
* s2end
= s2
+ w
*4;
1024 for(; s2
< s2end
; s2
+= 4, d2
++) {
1026 d2
[0] = ((d2
[0]*s2
[3])>>8) + s2
[1];
// Chroma plane setup: pitch is made positive again, chroma pitch is half.
1030 dst
.pitch
= abs(dst
.pitch
);
1035 dst
.pitchUV
= dst
.pitch
/2;
1039 ss
[0] = (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
*4;
1042 if(!dst
.bitsU
|| !dst
.bitsV
) {
1043 dst
.bitsU
= (BYTE
*)dst
.bits
+ dst
.pitch
*dst
.h
;
1044 dst
.bitsV
= dst
.bitsU
+ dst
.pitchUV
*dst
.h
/2;
1046 if(dst
.type
== MSP_YV12
) {
1047 BYTE
* p
= dst
.bitsU
;
1048 dst
.bitsU
= dst
.bitsV
;
1054 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1055 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1057 if(rd
.top
> rd
.bottom
) {
1058 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1059 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1060 dst
.pitchUV
= -dst
.pitchUV
;
// Chroma pass, once per plane.
1063 for(ptrdiff_t i
= 0; i
< 2; i
++) {
1067 for(ptrdiff_t j
= 0; j
< h2
; j
++, s
+= src
.pitch
*2, d
+= dst
.pitchUV
, is
+= src
.pitch
*2) {
1069 BYTE
* s2end
= s2
+ w
*4;
1072 for(; s2
< s2end
; s2
+= 8, d2
++, is2
+= 8) {
1073 unsigned int ia
= (s2
[3]+s2
[3+src
.pitch
]+is2
[3]+is2
[3+src
.pitch
])>>2;
1075 *d2
= ((*d2
*ia
)>>8) + ((s2
[0]+s2
[src
.pitch
])>>1);
// Widens the caller's dirty rectangles, in place, to the alignment required
// by the pixel layout, then forwards the list to the base class.
//  - AY11 source, or IYUV/YV12/P010/P016 blend target: left is rounded down
//    and right rounded up to a multiple of 16; bottom is rounded up to an
//    even value. NOTE(review): the matching adjustment of `top` is
//    presumably on a line that is not visible here.
//  - AUYV source, or YUY2 blend target: left is rounded down and right
//    rounded up to a multiple of 4.
// A NULL list skips the adjustment and is passed to the base class as is.
// The rectangles in the caller's list are modified through references.
1084 STDMETHODIMP
CMemSubPic::SetDirtyRectEx(CAtlList
<CRect
>* dirtyRectList
)
1086 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1087 if(dirtyRectList
!=NULL
)
1089 POSITION pos
= dirtyRectList
->GetHeadPosition();
1090 if(m_spd
.type
== MSP_AY11
|| m_alpha_blt_dst_type
==MSP_IYUV
|| m_alpha_blt_dst_type
==MSP_YV12
1091 || m_alpha_blt_dst_type
==MSP_P010
|| m_alpha_blt_dst_type
==MSP_P016
)
1095 CRect
& cRectSrc
= dirtyRectList
->GetNext(pos
);
1096 cRectSrc
.left
&= ~15;
1097 cRectSrc
.right
= (cRectSrc
.right
+15)&~15;
1099 cRectSrc
.bottom
= (cRectSrc
.bottom
+1)&~1;
1102 else if(m_spd
.type
== MSP_AUYV
|| m_alpha_blt_dst_type
==MSP_YUY2
)
1106 CRect
& cRectSrc
= dirtyRectList
->GetNext(pos
);
1107 cRectSrc
.left
&= ~3;
1108 cRectSrc
.right
= (cRectSrc
.right
+3)&~3;
1112 return __super::SetDirtyRectEx(dirtyRectList
);
1116 // CMemSubPicAllocator
// Allocator for CMemSubPic surfaces.
//   alpha_blt_dst_type - MSP_* format of the blend target; stored and later
//                        handed to every CMemSubPic created by AllocEx().
//   maxsize            - surface size; passed to the base class (together
//                        with two `false` flags) and kept in m_maxsize.
//   type               - defaults to -1 according to the inline comment.
// NOTE(review): the body of the switch on alpha_blt_dst_type is not visible
// here; presumably it derives the surface type when `type` is -1 - confirm.
1119 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type
, SIZE maxsize
, int type
/*=-1*/)
1120 : CSubPicExAllocatorImpl(maxsize
, false, false)
1121 , m_alpha_blt_dst_type(alpha_blt_dst_type
)
1122 , m_maxsize(maxsize
)
1127 switch(alpha_blt_dst_type
)
1146 // ISubPicAllocatorImpl
// Creates a new CMemSubPic and returns it through *ppSubPic with one
// reference added. The surface uses the allocator's maximum size, a pitch of
// (w * bpp) / 8 bytes per row, and a freshly allocated buffer of pitch * h
// bytes that is handed over to the CMemSubPic (its destructor frees it).
// NOTE(review): the argument checks, the bpp/type assignments, allocation
// failure handling and the return statement are not visible here.
1148 bool CMemSubPicAllocator::AllocEx(bool fStatic
, ISubPicEx
** ppSubPic
)
1154 spd
.w
= m_maxsize
.cx
;
1155 spd
.h
= m_maxsize
.cy
;
1157 spd
.pitch
= (spd
.w
*spd
.bpp
)>>3;
1159 spd
.bits
= DNew BYTE
[spd
.pitch
*spd
.h
];
1163 *ppSubPic
= DNew
CMemSubPic(spd
, m_alpha_blt_dst_type
);
1167 (*ppSubPic
)->AddRef();