Minor fix.
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blobc5b72ab3e5093f98dc22533a1ece527af8d6b716
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
// Averages 2x2 groups of unsigned bytes taken from two 16-byte rows.
//
// On entry m128_1 and m128_2 hold the upper and lower row. On exit each
// 16-bit lane of m128_1 holds, zero-extended, the average of the 2x2 group
// that lane covered (8 results). m128_2 is clobbered.
//
// The rounding matches the scalar form
//   (((a0 + b0 + 1) / 2) + ((a1 + b1 + 1) / 2) + 1) / 2
// because _mm_avg_epu8 rounds up at every stage.
//
// The body is wrapped in braces so the macro expands to one compound
// statement and stays correct under an unbraced `if`.
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    { \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); /* vertical average          */ \
        m128_2 = _mm_slli_epi16(m128_1, 8);    /* even bytes -> high byte   */ \
        m128_1 = _mm_srli_epi16(m128_1, 8);    /* odd bytes, zero-extended  */ \
        m128_2 = _mm_srli_epi16(m128_2, 8);    /* even bytes, zero-extended */ \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); /* horizontal average        */ \
    }
// Averages 2x2 groups of unsigned bytes taken from two 16-byte rows and
// duplicates every result into both bytes of its 16-bit lane.
//
// On entry m128_1 and m128_2 hold the upper and lower row. On exit every
// byte pair of m128_1 holds the same 2x2 average, so the result can be
// applied directly to byte-interleaved data. m128_2 is clobbered.
//
// The temporary lives inside its own scope, so the macro can be expanded
// more than once in a function and does not leak a name into the caller.
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
    { \
        m128_1 = _mm_avg_epu8(m128_1, m128_2);                    /* vertical average    */ \
        m128_2 = _mm_slli_epi16(m128_1, 8);                       /* even byte -> high   */ \
        const __m128i m128_swapped_lo = _mm_srli_epi16(m128_1, 8); /* odd byte  -> low   */ \
        m128_2 = _mm_or_si128(m128_2, m128_swapped_lo);           /* byte-swapped lanes  */ \
        m128_1 = _mm_avg_epu8(m128_1, m128_2);                    /* horizontal average  */ \
    }
// Scalar 2x2 chroma subsampler that interleaves the result.
//
// Reads two rows (row 0 and row `pitch` bytes further) of `w` bytes from each
// of the planes `u` and `v`, averages every 2x2 group, and writes the results
// byte-interleaved to `dst`: dst[0] from `u`, dst[1] from `v`, and so on.
// `w` bytes are written in total.
//
// `dst` may be the same pointer as `u` or `v` (in-place conversion). All four
// inputs of a group are therefore read before either output byte is stored;
// storing dst[0] first would overwrite v[0] when `dst == v`.
//
// The rounding is ((a + b + 1)/2 + (c + d + 1)/2 + 1)/2, matching the SSE2
// implementation.
void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (; u < end; dst += 2, u += 2, v += 2)
    {
        const int u_left  = (u[0] + u[0 + pitch] + 1) / 2;
        const int u_right = (u[1] + u[1 + pitch] + 1) / 2;
        const int v_left  = (v[0] + v[0 + pitch] + 1) / 2;
        const int v_right = (v[1] + v[1 + pitch] + 1) / 2;

        dst[0] = static_cast<BYTE>((u_left + u_right + 1) / 2);
        dst[1] = static_cast<BYTE>((v_left + v_right + 1) / 2);
    }
}
56 __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
58 const BYTE* end = u + w;
59 for (;u<end;dst+=16,u+=16,v+=16)
61 __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
62 __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
63 __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
64 __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
65 AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
66 AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
67 u_1 = _mm_packus_epi16(u_1, u_1);
68 v_1 = _mm_packus_epi16(v_1, v_1);
69 u_1 = _mm_unpacklo_epi8(u_1, v_1);
71 _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
75 static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
77 __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
78 __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
79 __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
80 __m128i zero = _mm_setzero_si128();
82 __m128i ones;
83 #ifdef _DEBUG
84 ones = _mm_setzero_si128();//disable warning C4700
85 #endif
86 ones = _mm_cmpeq_epi32(ones,ones);
87 ones = _mm_cmpeq_epi8(ones,alpha128);
89 __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
90 __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);
92 __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
94 dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
95 dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
96 dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
98 dst128 = _mm_unpackhi_epi8(dst128, zero);
99 alpha128 = _mm_unpackhi_epi8(alpha128, zero);
101 ones2 = _mm_unpackhi_epi8(ones, zero);
103 dst128 = _mm_mullo_epi16(dst128, alpha128);
104 dst128 = _mm_adds_epu16(dst128, ones2);
105 dst128 = _mm_srli_epi16(dst128, 8);
106 dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
108 dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
109 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
112 /***
113 * output not exactly identical to pix_alpha_blend_yv12_chroma
115 static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
117 __m128i zero = _mm_setzero_si128();
118 __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
119 __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
120 __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );
122 __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
123 __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
125 AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);
127 __m128i ones;
128 #ifdef _DEBUG
129 ones = _mm_setzero_si128();//disable warning C4700
130 #endif
131 ones = _mm_cmpeq_epi32(ones,ones);
132 ones = _mm_cmpeq_epi8(ones, alpha128_1);
134 dst128 = _mm_unpacklo_epi8(dst128, zero);
135 __m128i dst128_2 = _mm_and_si128(dst128, ones);
137 dst128 = _mm_mullo_epi16(dst128, alpha128_1);
138 dst128 = _mm_adds_epu16(dst128, dst128_2);
140 dst128 = _mm_srli_epi16(dst128, 8);
142 AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);
144 dst128 = _mm_adds_epi16(dst128, sub128_1);
145 dst128 = _mm_packus_epi16(dst128, dst128);
147 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
150 static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
152 unsigned int ia = (alpha[0]+alpha[1]+
153 alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
154 if(ia!=0xff)
156 *dst= (((*dst)*ia)>>8) + ((src[0] +src[1]+
157 src[src_pitch]+src[1+src_pitch] )>>2);
161 static void AlphaBltYv12Luma(byte* dst, int dst_pitch,
162 int w, int h,
163 const byte* sub, const byte* alpha, int sub_pitch)
165 if( ((reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch) |
166 reinterpret_cast<intptr_t>(dst) | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 )
168 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
170 const BYTE* sa = alpha;
171 const BYTE* s2 = sub;
172 const BYTE* s2end_mod16 = s2 + (w&~15);
173 const BYTE* s2end = s2 + w;
174 BYTE* d2 = dst;
176 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
178 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
180 for(; s2 < s2end; s2++, sa++, d2++)
182 if(sa[0] < 0xff)
184 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
189 else //fix me: only a workaround for non-mod-16 size video
191 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
193 const BYTE* sa = alpha;
194 const BYTE* s2 = sub;
195 const BYTE* s2end_mod16 = s2 + (w&~15);
196 const BYTE* s2end = s2 + w;
197 BYTE* d2 = dst;
198 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
200 if(sa[0] < 0xff)
202 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
203 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
210 static void AlphaBltYv12Chroma(byte* dst, int dst_pitch,
211 int w, int chroma_h,
212 const byte* sub_chroma, const byte* alpha, int sub_pitch)
214 if( ((reinterpret_cast<intptr_t>(sub_chroma) |
215 //reinterpret_cast<intptr_t>(dst) |
216 reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch)
217 //| (static_cast<intptr_t>(dst_pitch)&7)
218 ) & 15 )==0 )
220 int pitch = sub_pitch;
221 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
223 const BYTE* s2 = sub_chroma;
224 const BYTE* sa2 = alpha;
225 const BYTE* s2end_mod16 = s2 + (w&~15);
226 const BYTE* s2end = s2 + w;
227 BYTE* d2 = dst;
229 for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
231 pix_alpha_blend_yv12_chroma_sse2(d2, s2, sa2, sub_pitch);
233 for(; s2 < s2end; s2+=2, sa2+=2, d2++)
235 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
239 else//fix me: only a workaround for non-mod-16 size video
241 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
243 const BYTE* s2 = sub_chroma;
244 const BYTE* sa2 = alpha;
245 const BYTE* s2end_mod16 = s2 + (w&~15);
246 const BYTE* s2end = s2 + w;
247 BYTE* d2 = dst;
248 for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
250 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
256 __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
258 //important!
259 __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
260 __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
261 __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
263 __m128i alpha_ff;
264 #ifdef _DEBUG
265 alpha_ff = _mm_setzero_si128();//disable warning C4700
266 #endif
267 alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
269 alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
271 __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
272 //so we do it another way
273 //first, (alpha<<8)+0xff
274 __m128i ones = _mm_setzero_si128();
275 ones = _mm_cmpeq_epi16(dst_y, ones);
277 __m128i ones2;
278 #ifdef _DEBUG
279 ones2 = _mm_setzero_si128();//disable warning C4700
280 #endif
281 ones2 = _mm_cmpeq_epi32(ones2,ones2);
283 ones = _mm_xor_si128(ones, ones2);
284 ones = _mm_srli_epi16(ones, 15);
285 ones = _mm_and_si128(ones, lo);
287 dst_y = _mm_mulhi_epu16(dst_y, lo);
288 dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
290 lo = _mm_setzero_si128();
291 lo = _mm_unpacklo_epi8(lo, src_y);
292 dst_y = _mm_adds_epu16(dst_y, lo);
293 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
295 dst += 16;
296 dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
298 lo = _mm_unpackhi_epi8(alpha_ff, alpha);
300 ones = _mm_setzero_si128();
301 ones = _mm_cmpeq_epi16(dst_y, ones);
302 ones = _mm_xor_si128(ones, ones2);
303 ones = _mm_srli_epi16(ones, 15);
304 ones = _mm_and_si128(ones, lo);
306 dst_y = _mm_mulhi_epu16(dst_y, lo);
307 dst_y = _mm_adds_epu16(dst_y, ones);
309 lo = _mm_setzero_si128();
310 lo = _mm_unpackhi_epi8(lo, src_y);
311 dst_y = _mm_adds_epu16(dst_y, lo);
312 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
//for test only
// Scalar reference for mix_16_y_p010_sse2.
//
//   dst        16 consecutive 16-bit samples, updated in place
//   src        16 source luma bytes
//   src_alpha  16 alpha bytes
//
// Samples whose alpha is 0xFF are left untouched; all others become
//   ((dst * alpha) >> 8) + (src << 8),
// clamped to 0xFFFF. The clamp matches the saturating add of the SSE2
// version and the explicit clamp in mix_16_uv_p010_c; without it the sum
// wrapped around when stored into the 16-bit sample.
void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i = 0; i < 16; i++)
    {
        if (src_alpha[i] != 0xff)
        {
            int tmp = ((dst_word[i] * src_alpha[i]) >> 8) + (src[i] << 8);
            if (tmp > 0xffff) tmp = 0xffff;
            dst_word[i] = static_cast<WORD>(tmp);
        }
    }
}
328 __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
330 //important!
331 __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
332 __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
334 __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
335 __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
337 AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);
339 __m128i alpha_ff;
340 #ifdef _DEBUG
341 alpha_ff = _mm_setzero_si128();//disable warning C4700
342 #endif
343 alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
345 alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
347 __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
348 //so we do it another way
349 //first, (alpha<<8)+0xff
350 __m128i ones = _mm_setzero_si128();
351 ones = _mm_cmpeq_epi16(dst_y, ones);
353 __m128i ones2;
354 #ifdef _DEBUG
355 ones2 = _mm_setzero_si128();//disable warning C4700
356 #endif
357 ones2 = _mm_cmpeq_epi32(ones2,ones2);
358 ones = _mm_xor_si128(ones, ones2);
359 ones = _mm_srli_epi16(ones, 15);
360 ones = _mm_and_si128(ones, lo);
362 dst_y = _mm_mulhi_epu16(dst_y, lo);
363 dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
365 lo = _mm_setzero_si128();
366 lo = _mm_unpacklo_epi8(lo, src_y);
367 dst_y = _mm_adds_epu16(dst_y, lo);
368 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
370 dst += 16;
371 dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
373 lo = _mm_unpackhi_epi8(alpha_ff, alpha);
375 ones = _mm_setzero_si128();
376 ones = _mm_cmpeq_epi16(dst_y, ones);
377 ones = _mm_xor_si128(ones, ones2);
378 ones = _mm_srli_epi16(ones, 15);
379 ones = _mm_and_si128(ones, lo);
381 dst_y = _mm_mulhi_epu16(dst_y, lo);
382 dst_y = _mm_adds_epu16(dst_y, ones);
384 lo = _mm_setzero_si128();
385 lo = _mm_unpackhi_epi8(lo, src_y);
386 dst_y = _mm_adds_epu16(dst_y, lo);
387 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
390 //for test only
391 void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
393 WORD* dst_word = reinterpret_cast<WORD*>(dst);
394 for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
396 unsigned int ia = (
397 (src_alpha[0]+src_alpha[0+pitch]+1)/2+
398 (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
399 if( ia!=0xFF )
401 int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
402 if(tmp>0xffff) tmp = 0xffff;
403 dst_word[0] = tmp;
404 tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
405 if(tmp>0xffff) tmp = 0xffff;
406 dst_word[1] = tmp;
411 __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
413 __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
414 __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
415 __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
416 __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
418 AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
419 __m128i zero = _mm_setzero_si128();
421 __m128i ones;
422 #ifdef _DEBUG
423 ones = _mm_setzero_si128();//disable warning C4700
424 #endif
425 ones = _mm_cmpeq_epi32(ones,ones);
426 ones = _mm_cmpeq_epi8(ones,alpha128_1);
428 __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
429 alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);
431 __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
433 dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
434 dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
435 dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
437 dst128 = _mm_unpackhi_epi8(dst128, zero);
438 alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);
440 ones2 = _mm_unpackhi_epi8(ones, zero);
442 dst128 = _mm_mullo_epi16(dst128, alpha128_1);
443 dst128 = _mm_adds_epu16(dst128, ones2);
444 dst128 = _mm_srli_epi16(dst128, 8);
445 dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
447 dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
448 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
451 //for test only
452 void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
454 for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
456 unsigned int ia = (
457 (src_alpha[0]+src_alpha[0+pitch]+1)/2+
458 (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
459 if( ia!=0xFF )
461 dst[0] = (((dst[0])*ia)>>8) + src[0];
462 dst[1] = (((dst[1])*ia)>>8) + src[1];
468 // CMemSubPic
471 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
472 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
474 m_maxsize.SetSize(spd.w, spd.h);
475 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
476 CRect allSpd(0,0,spd.w, spd.h);
477 m_rectListDirty.AddTail(allSpd);
480 CMemSubPic::~CMemSubPic()
482 delete [] m_spd.bits, m_spd.bits = NULL;
485 // ISubPic
487 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
489 return (void*)&m_spd;
492 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
494 spd.type = m_spd.type;
495 spd.w = m_size.cx;
496 spd.h = m_size.cy;
497 spd.bpp = m_spd.bpp;
498 spd.pitch = m_spd.pitch;
499 spd.bits = m_spd.bits;
500 spd.bitsU = m_spd.bitsU;
501 spd.bitsV = m_spd.bitsV;
502 spd.vidrect = m_vidrect;
503 return S_OK;
506 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
508 HRESULT hr;
509 if(FAILED(hr = __super::CopyTo(pSubPic))) {
510 return hr;
513 SubPicDesc src, dst;
514 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
515 return E_FAIL;
517 while(!m_rectListDirty.IsEmpty())
519 CRect& cRect = m_rectListDirty.GetHead();
520 int w = cRect.Width(), h = cRect.Height();
521 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
522 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
523 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
524 memcpy(d, s, w*4);
526 return S_OK;
529 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
531 if(m_rectListDirty.IsEmpty()) {
532 return S_OK;
534 while(!m_rectListDirty.IsEmpty())
536 //pDirtyRect = m_rectListDirty.RemoveHead();
537 CRect& dirtyRect = m_rectListDirty.RemoveTail();
538 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
539 int w = dirtyRect.Width();
540 if(m_spd.type!=MSP_AYUV_PLANAR)
542 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
544 #ifdef _WIN64
545 memsetd(p, color, w*4); // nya
546 #else
547 __asm
549 mov eax, color
550 mov ecx, w
551 mov edi, p
553 rep stosd
556 #endif
559 else
561 ///TODO:
562 ///FIX ME
563 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
565 // memsetd(p, 0, m_rcDirty.Width());
566 //DbgLog((LOG_TRACE, 3, "w:%d", w));
567 //w = pDirtyRect->Width();
568 memset(p, 0xFF, w);
569 memset(p+m_spd.h*m_spd.pitch, 0, w);
570 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
571 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
575 m_rectListDirty.RemoveAll();
576 return S_OK;
579 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
581 return GetDesc(spd);
584 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
586 int src_type = m_spd.type;
587 int dst_type = m_alpha_blt_dst_type;
588 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
589 dst_type == MSP_RGB24 ||
590 dst_type == MSP_RGB16 ||
591 dst_type == MSP_RGB15))
593 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
595 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
597 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
598 dst_type == MSP_YV12 ||
599 dst_type == MSP_P010 ||
600 dst_type == MSP_P016 ||
601 dst_type == MSP_NV12 ||
602 dst_type == MSP_NV21)))
604 return UnlockOther(dirtyRectList);
606 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
607 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
608 dst_type == MSP_IYUV ||
609 dst_type == MSP_YV12 ||
610 dst_type == MSP_NV12 ||
611 dst_type == MSP_NV21 ||
612 dst_type == MSP_P010 ||
613 dst_type == MSP_P016))
615 return UnlockRGBA_YUV(dirtyRectList);
617 return E_NOTIMPL;
620 STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
622 SetDirtyRectEx(dirtyRectList);
623 if(m_rectListDirty.IsEmpty()) {
624 return S_OK;
627 POSITION pos = m_rectListDirty.GetHeadPosition();
628 while(pos!=NULL)
630 const CRect& cRect = m_rectListDirty.GetNext(pos);
631 int w = cRect.Width(), h = cRect.Height();
632 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
633 BYTE* bottom = top + m_spd.pitch*h;
634 if(m_alpha_blt_dst_type == MSP_RGB16)
636 for(; top < bottom ; top += m_spd.pitch)
638 DWORD* s = (DWORD*)top;
639 DWORD* e = s + w;
640 for(; s < e; s++)
642 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
643 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
647 else if(m_alpha_blt_dst_type == MSP_RGB15)
649 for(; top < bottom; top += m_spd.pitch)
651 DWORD* s = (DWORD*)top;
652 DWORD* e = s + w;
653 for(; s < e; s++)
655 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
656 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
660 else if(m_alpha_blt_dst_type == MSP_YUY2)
662 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
664 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
666 BYTE* s = tempTop;
667 BYTE* e = s + w*4;
668 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
670 s[4] = (s[0] + s[4])>>1;
671 s[0] = (s[2] + s[6])>>1;
675 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
677 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV )
679 //nothing to do
681 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
682 || m_alpha_blt_dst_type == MSP_NV12 )
684 SubsampleAndInterlace(cRect, true);
686 else if( m_alpha_blt_dst_type == MSP_NV21 )
688 SubsampleAndInterlace(cRect, false);
691 return S_OK;
// Registers the supplied dirty rectangles, then converts the ARGB pixels of
// every dirty rectangle in place to YUV using the default colour conversion
// table. Returns S_OK, also when there is nothing dirty.
//
// Target YUY2 / YV12 / IYUV / P010 / P016 / NV12 / NV21:
//   pixels are handled in pairs (8 bytes). Each pixel gets its own luma in
//   byte 1; byte 0 of the first pixel and byte 0 of the second pixel receive
//   the two chroma values computed from the pair (the original comment names
//   the layout "AxYU AxYV"). The alpha bytes (3 and 7) are not modified.
//   A pair whose alpha bytes are both 0xFF is zeroed instead.
// Target AYUV:
//   every pixel is converted on its own; a pixel with alpha 0xFF is zeroed.
//
// The statement order inside the loops matters: the conversion overwrites
// bytes that later statements still read (for example s[1]/s[5] hold luma by
// the time scaled_y is computed).
694 STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
696 SetDirtyRectEx(dirtyRectList);
697 if(m_rectListDirty.IsEmpty()) {
698 return S_OK;
// Local copies of the conversion table entries used below.
701 const ColorConvTable *conv_table = ColorConvTable::GetDefaultColorConvTable();
702 const int *c2y_yb = conv_table->c2y_yb;
703 const int *c2y_yg = conv_table->c2y_yg;
704 const int *c2y_yr = conv_table->c2y_yr;
705 const int cy_cy2 = conv_table->cy_cy2;
706 const int c2y_cu = conv_table->c2y_cu;
707 const int c2y_cv = conv_table->c2y_cv;
708 const int cy_cy = conv_table->cy_cy;
709 const unsigned char* Clip = conv_table->Clip;
711 POSITION pos = m_rectListDirty.GetHeadPosition();
712 while(pos!=NULL)
714 const CRect& cRect = m_rectListDirty.GetNext(pos);
715 int w = cRect.Width(), h = cRect.Height();
// First and one-past-last row of the rectangle; 4 bytes per pixel.
717 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
718 BYTE* bottom = top + m_spd.pitch*h;
720 if( m_alpha_blt_dst_type == MSP_YUY2 ||
721 m_alpha_blt_dst_type == MSP_YV12 ||
722 m_alpha_blt_dst_type == MSP_IYUV ||
723 m_alpha_blt_dst_type == MSP_P010 ||
724 m_alpha_blt_dst_type == MSP_P016 ||
725 m_alpha_blt_dst_type == MSP_NV12 ||
726 m_alpha_blt_dst_type == MSP_NV21) {
727 for(; top < bottom ; top += m_spd.pitch) {
728 BYTE* s = top;
729 BYTE* e = s + w*4;
// NOTE(review): the loop advances two pixels at a time, so an odd w touches
// one pixel past the rectangle - confirm rectangles are always even-width.
730 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
// 0x1fe == 0xff + 0xff: skip the maths when both pixels have alpha 0xFF.
731 if((s[3]+s[7]) < 0x1fe) {
// a is the combined (0x200 - alpha sum) of the pair, scaled to 16 bits.
732 int a = 0x200 - (s[3]+s[7]);
733 a <<= 7;
734 // 0 <= a <= 0x10000
// Luma of each pixel from the B/G/R lookup tables plus an a-scaled offset.
735 s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
736 s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a + 0x8000) >> 16;
// s[1] and s[5] already hold luma here.
738 int scaled_y = (s[1]+s[5]-32) * cy_cy2;
// Chroma from the summed s[0]+s[4] and s[2]+s[6] of the pair, clipped
// through the table.
740 s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
741 s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
742 } else {
743 s[1] = s[5] = 0;
744 s[0] = s[4] = 0;
749 else if(m_alpha_blt_dst_type == MSP_AYUV) {
750 for(; top < bottom ; top += m_spd.pitch) {
751 BYTE* s = top;
752 BYTE* e = s + w*4;
753 for(; s < e; s+=4) { // ARGB -> AYUV
754 if(s[3] < 0xff) {
// a is (0x100 - alpha) scaled to 16 bits.
755 int a = 0x100 - s[3];
756 a <<= 8;
757 // 0 <= a <= 0x10000
759 int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
760 int scaled_y = (y-32) * cy_cy;
// s[1] is written before s[0] because its formula still reads the old s[0];
// s[2] is read for s[0] before being replaced by y.
761 s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
762 s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
763 s[2] = y;
764 } else {
765 s[0] = s[1] = 0;
766 s[2] = 0;
772 return S_OK;
775 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
777 //fix me: check alignment and log error
778 int w = cRect.Width(), h = cRect.Height();
779 BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
780 BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
781 BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
782 BYTE* dst = u_start;
783 if(!u_first)
785 BYTE* tmp = v_start;
786 v_start = u_start;
787 u_start = tmp;
790 //Todo: fix me.
791 //Walkarround for alignment
792 if ( (m_spd.pitch&15) == 0 )
794 for (int i=0;i<h;i+=2)
796 subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w, m_spd.pitch);
797 u_start += 2*m_spd.pitch;
798 v_start += 2*m_spd.pitch;
799 dst += m_spd.pitch;
802 else
804 for (int i=0;i<h;i+=2)
806 subsample_and_interlace_2_line_c(dst, u_start, v_start, w, m_spd.pitch);
807 u_start += 2*m_spd.pitch;
808 v_start += 2*m_spd.pitch;
809 dst += m_spd.pitch;
814 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
816 if(!pSrc || !pDst || !pTarget) {
817 return E_POINTER;
819 int src_type = m_spd.type;
820 int dst_type = pTarget->type;
822 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
823 dst_type == MSP_RGB24 ||
824 dst_type == MSP_RGB16 ||
825 dst_type == MSP_RGB15 ||
826 dst_type == MSP_RGBA ||
827 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
828 dst_type == MSP_AYUV ))
830 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
832 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
834 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
835 dst_type == MSP_YV12)) )
837 return AlphaBltOther(pSrc, pDst, pTarget);
839 else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
840 dst_type == MSP_NV21 ) )
842 return AlphaBltAnv12_Nvxx(pSrc, pDst, pTarget);
845 else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
846 dst_type == MSP_P016 ) )
848 return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
850 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
851 dst_type == MSP_YV12))
853 return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
855 else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
856 dst_type == MSP_NV21))
858 return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
860 else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
861 dst_type == MSP_P016))
863 return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
865 return E_NOTIMPL;
// Generic blender: blends the source rectangle *pSrc of this sub-picture onto
// the rectangle *pDst of *pTarget, with one code path per target format.
//
// Returns E_INVALIDARG when the two rectangles differ in size, E_NOTIMPL for
// target formats without a code path here, S_OK otherwise.
//
// A negative target height marks a vertically flipped target: the destination
// rectangle is mirrored, and when it ends up with top > bottom the
// destination is walked upwards using a negated pitch.
868 STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
870 const SubPicDesc& src = m_spd;
871 SubPicDesc dst = *pTarget; // copy, because we might modify it
873 CRect rs(*pSrc), rd(*pDst);
// Flipped target: mirror the destination rectangle vertically.
874 if(dst.h < 0)
876 dst.h = -dst.h;
877 rd.bottom = dst.h - rd.bottom;
878 rd.top = dst.h - rd.top;
880 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
881 return E_INVALIDARG;
883 int w = rs.Width(), h = rs.Height();
// s / d: first source and destination byte of the rectangles.
884 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
885 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Bottom-up destination: start on row rd.top-1 and step with a negated pitch.
886 if(rd.top > rd.bottom)
888 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
889 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
890 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
892 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
894 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
896 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
898 else
900 return E_NOTIMPL;
902 dst.pitch = -dst.pitch;
904 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
905 switch(dst.type)
// RGBA target: for source pixels with alpha below 0xFF, each colour channel
// is scaled by 256/(0x100 - alpha) and the alpha byte is stored inverted.
// The destination's previous content is not read.
907 case MSP_RGBA:
908 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
910 BYTE* s2 = s;
911 BYTE* s2end = s2 + w*4;
912 DWORD* d2 = (DWORD*)d;
913 for(; s2 < s2end; s2 += 4, d2++)
915 if(s2[3] < 0xff)
917 DWORD bd =0x00000100 -( (DWORD) s2[3]);
918 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
919 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
920 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
// '&' binds tighter than '|', so the mask applies to the alpha term only.
921 *d2 = B | V | R
922 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
926 break;
// 32-bit targets: two channels are blended per multiplication by masking
// 0x00ff00ff and 0x0000ff00 separately.
927 case MSP_RGB32:
928 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
929 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
931 BYTE* s2 = s;
932 BYTE* s2end = s2 + w*4;
933 DWORD* d2 = (DWORD*)d;
934 for(; s2 < s2end; s2 += 4, d2++)
// NOTE(review): the two builds blend differently - Win64 scales the source
// by (256 - alpha) as well, 32-bit adds the source unscaled. Confirm which
// one matches the way the source pixels are prepared.
936 #ifdef _WIN64
937 DWORD ia = 256-s2[3];
938 if(s2[3] < 0xff) {
939 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
940 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
942 #else
943 if(s2[3] < 0xff)
945 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
946 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
948 #endif
951 break;
// 24-bit target: per channel dst = ((dst * alpha) >> 8) + src.
952 case MSP_RGB24:
953 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
955 BYTE* s2 = s;
956 BYTE* s2end = s2 + w*4;
957 BYTE* d2 = d;
958 for(; s2 < s2end; s2 += 4, d2 += 3)
960 if(s2[3] < 0xff)
962 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
963 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
964 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
968 break;
// 16-bit 5:6:5 target. The source alpha is compared against 0x1f and the
// product shifted by 5: UnlockOther has already reduced alpha to 5 bits and
// packed the colour into the low 16 bits of each source DWORD.
969 case MSP_RGB16:
970 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
972 BYTE* s2 = s;
973 BYTE* s2end = s2 + w*4;
974 WORD* d2 = (WORD*)d;
975 for(; s2 < s2end; s2 += 4, d2++)
977 if(s2[3] < 0x1f)
979 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
980 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
981 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
982 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
983 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
988 break;
// 16-bit 5:5:5 target; same scheme as RGB16 with 5:5:5 masks.
989 case MSP_RGB15:
990 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
992 BYTE* s2 = s;
993 BYTE* s2end = s2 + w*4;
994 WORD* d2 = (WORD*)d;
995 for(; s2 < s2end; s2 += 4, d2++)
997 if(s2[3] < 0x1f)
999 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
1000 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
1001 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
1002 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
1003 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
1008 break;
// YUY2 target: two source pixels (8 bytes) blend onto one destination DWORD.
// The two luma bytes use their own alpha, the two chroma bytes the average
// alpha of the pair. The arithmetic is done with MMX inline assembly.
1009 case MSP_YUY2:
1010 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
1012 unsigned int ia, c;
1013 BYTE* s2 = s;
1014 BYTE* s2end = s2 + w*4;
1015 DWORD* d2 = (DWORD*)d;
1016 for(; s2 < s2end; s2 += 8, d2++)
1018 ia = (s2[3]+s2[7])>>1;
1019 if(ia < 0xff)
1021 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
1022 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
1023 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
1024 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
1025 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
// Pack the four per-byte alphas and the four source bytes in destination
// byte order.
1027 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
1028 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
// Per byte: dst = sat8(((dst * (alpha >> 1)) >> 7) + src).
1029 __asm
1031 mov edi, d2
1032 pxor mm0, mm0
1033 movd mm2, c
1034 punpcklbw mm2, mm0
1035 movd mm3, [edi]
1036 punpcklbw mm3, mm0
1037 movd mm4, ia
1038 punpcklbw mm4, mm0
1039 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
1040 pmullw mm3, mm4
1041 psraw mm3, 7
1042 paddsw mm3, mm2
1043 packuswb mm3, mm3
1044 movd [edi], mm3
// Leave MMX state before returning to code that may use the FPU.
1049 __asm emms;
1050 break;
// Planar 4:2:0 targets. The source is read as four stacked planes of
// src.pitch * src.h bytes each: alpha, then luma, then the two chroma planes.
1051 case MSP_YV12:
1052 case MSP_IYUV:
1054 //dst.pitch = abs(dst.pitch);
1055 int h2 = h/2;
// Derive the chroma pitch and plane pointers when the caller left them unset;
// YV12 swaps the two derived planes relative to IYUV.
1056 if(!dst.pitchUV)
1058 dst.pitchUV = abs(dst.pitch)/2;
1060 if(!dst.bitsU || !dst.bitsV)
1062 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
1063 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1064 if(dst.type == MSP_YV12)
1066 BYTE* p = dst.bitsU;
1067 dst.bitsU = dst.bitsV;
1068 dst.bitsV = p;
// dd[0] / dd[1]: first destination byte in the U and V planes.
1071 BYTE* dd[2];
1072 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1073 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1074 if(rd.top > rd.bottom)
1076 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1077 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1078 dst.pitchUV = -dst.pitchUV;
// src_origin addresses the alpha plane; one byte per pixel here.
1081 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
1083 BYTE* ss[2];
1084 ss[0] = src_origin + src.pitch*src.h*2;//U
1085 ss[1] = src_origin + src.pitch*src.h*3;//V
1087 AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
1089 AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
1090 AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
1092 __asm emms;
1094 break;
1095 default:
1096 return E_NOTIMPL;
1097 break;
1100 //emms takes about 40 CPU cycles
1101 //__asm emms;
1102 return S_OK;
1105 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1107 const SubPicDesc& src = m_spd;
1108 SubPicDesc dst = *pTarget; // copy, because we might modify it
1110 CRect rs(*pSrc), rd(*pDst);
1112 if(dst.h < 0) {
1113 dst.h = -dst.h;
1114 rd.bottom = dst.h - rd.bottom;
1115 rd.top = dst.h - rd.top;
1118 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1119 return E_INVALIDARG;
1122 int w = rs.Width(), h = rs.Height();
1125 BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1126 BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1128 if(rd.top > rd.bottom) {
1129 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1131 dst.pitch = -dst.pitch;
1134 for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1136 BYTE* s2 = s;
1137 BYTE* s2end = s2 + w*4;
1138 WORD* d2 = reinterpret_cast<WORD*>(d);
1139 for(; s2 < s2end; s2 += 4, d2++)
1141 if(s2[3] < 0xff) {
1142 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
1147 //UV
1148 int h2 = h/2;
1149 if(!dst.pitchUV)
1151 dst.pitchUV = abs(dst.pitch);
1153 if(!dst.bitsU || !dst.bitsV)
1155 dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1156 dst.bitsV = dst.bitsU + 2;
1158 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1159 if(rd.top > rd.bottom)
1161 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1162 dst.pitchUV = -dst.pitchUV;
1165 s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1167 d = ddUV;
1168 int pitch = src.pitch;
1169 for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
1171 BYTE* s2 = s;
1172 WORD* d2=reinterpret_cast<WORD*>(d);
1173 WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
1174 for( ; d2<d2_end; s2+=8, d2+=2)
1176 unsigned int ia = (
1177 s2[3]+ s2[3+4]+
1178 s2[3+src.pitch]+s2[3+4+src.pitch]);
1179 if( ia!=0xFF*4 )
1181 d2[0] = (((d2[0])*ia)>>10) + ((s2[0] + s2[0+src.pitch])<<7);
1182 d2[1] = (((d2[1])*ia)>>10) + ((s2[4] + s2[4+src.pitch])<<7);
1187 return S_OK;
1190 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1192 const SubPicDesc& src = m_spd;
1193 SubPicDesc dst = *pTarget; // copy, because we might modify it
1195 CRect rs(*pSrc), rd(*pDst);
1197 if(dst.h < 0) {
1198 dst.h = -dst.h;
1199 rd.bottom = dst.h - rd.bottom;
1200 rd.top = dst.h - rd.top;
1203 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1204 return E_INVALIDARG;
1207 int w = rs.Width(), h = rs.Height();
1209 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1210 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1212 if(rd.top > rd.bottom) {
1213 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1215 dst.pitch = -dst.pitch;
1218 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1219 BYTE* s2 = s;
1220 BYTE* s2end = s2 + w*4;
1221 BYTE* d2 = d;
1222 for(; s2 < s2end; s2 += 4, d2++) {
1223 if(s2[3] < 0xff) {
1224 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1228 dst.pitch = abs(dst.pitch);
1230 int h2 = h/2;
1232 if(!dst.pitchUV) {
1233 dst.pitchUV = dst.pitch/2;
1236 BYTE* ss[2];
1237 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1238 ss[1] = ss[0] + 4;
1240 if(!dst.bitsU || !dst.bitsV) {
1241 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1242 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1244 if(dst.type == MSP_YV12) {
1245 BYTE* p = dst.bitsU;
1246 dst.bitsU = dst.bitsV;
1247 dst.bitsV = p;
1251 BYTE* dd[2];
1252 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1253 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1255 if(rd.top > rd.bottom) {
1256 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1257 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1258 dst.pitchUV = -dst.pitchUV;
1261 for(ptrdiff_t i = 0; i < 2; i++) {
1262 s = ss[i];
1263 d = dd[i];
1264 BYTE* is = ss[1-i];
1265 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1266 BYTE* s2 = s;
1267 BYTE* s2end = s2 + w*4;
1268 BYTE* d2 = d;
1269 BYTE* is2 = is;
1270 for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
1271 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1272 if(ia < 0xff) {
1273 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1279 return S_OK;
1282 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1284 const SubPicDesc& src = m_spd;
1285 SubPicDesc dst = *pTarget; // copy, because we might modify it
1287 CRect rs(*pSrc), rd(*pDst);
1289 if(dst.h < 0) {
1290 dst.h = -dst.h;
1291 rd.bottom = dst.h - rd.bottom;
1292 rd.top = dst.h - rd.top;
1295 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1296 return E_INVALIDARG;
1299 int w = rs.Width(), h = rs.Height();
1301 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1302 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1304 if(rd.top > rd.bottom) {
1305 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1307 dst.pitch = -dst.pitch;
1310 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1311 BYTE* s2 = s;
1312 BYTE* s2end = s2 + w*4;
1313 BYTE* d2 = d;
1314 for(; s2 < s2end; s2 += 4, d2++) {
1315 if(s2[3] < 0xff) {
1316 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1320 dst.pitch = abs(dst.pitch);
1322 int h2 = h/2;
1324 if(!dst.pitchUV) {
1325 dst.pitchUV = dst.pitch;
1328 BYTE* ss[2];
1329 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1330 ss[1] = ss[0] + 4;
1332 if(!dst.bitsU || !dst.bitsV) {
1333 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1334 dst.bitsV = dst.bitsU + 1;
1336 if(dst.type == MSP_NV21) {
1337 BYTE* p = dst.bitsU;
1338 dst.bitsU = dst.bitsV;
1339 dst.bitsV = p;
1343 BYTE* dd[2];
1344 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1345 dd[1] = dd[0]+1;
1347 if(rd.top > rd.bottom) {
1348 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1349 dd[1] = dd[0]+1;
1350 dst.pitchUV = -dst.pitchUV;
1353 for(ptrdiff_t i = 0; i < 2; i++) {
1354 s = ss[i];
1355 d = dd[i];
1356 BYTE* is = ss[1-i];
1357 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1358 BYTE* s2 = s;
1359 BYTE* s2end = s2 + w*4;
1360 BYTE* d2 = d;
1361 BYTE* is2 = is;
1362 for(; s2 < s2end; s2 += 8, d2+=2, is2 += 8) {
1363 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1364 if(ia < 0xff) {
1365 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1371 return S_OK;
1374 STDMETHODIMP CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1376 //fix me: check colorspace and log error
1377 const SubPicDesc& src = m_spd;
1378 SubPicDesc dst = *pTarget; // copy, because we might modify it
1380 CRect rs(*pSrc), rd(*pDst);
1381 if(dst.h < 0)
1383 dst.h = -dst.h;
1384 rd.bottom = dst.h - rd.bottom;
1385 rd.top = dst.h - rd.top;
1387 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1388 return E_INVALIDARG;
1390 int w = rs.Width(), h = rs.Height();
1391 bool bottom_down = rd.top > rd.bottom;
1393 BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1394 if(bottom_down)
1396 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1397 dst.pitch = -dst.pitch;
1400 //dst.pitch = abs(dst.pitch);
1401 int h2 = h/2;
1402 if(!dst.pitchUV)
1404 dst.pitchUV = abs(dst.pitch);
1406 dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1407 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1408 if(bottom_down)
1410 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1411 dst.pitchUV = -dst.pitchUV;
1414 BYTE* src_origin= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1415 BYTE *s = src_origin;
1417 // equivalent:
1418 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
1419 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
1420 if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
1421 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1423 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1425 BYTE* sa = s;
1426 BYTE* s2 = s + src.pitch*src.h;
1427 BYTE* s2end_mod16 = s2 + (w&~15);
1428 BYTE* s2end = s2 + w;
1429 BYTE* d2 = d;
1431 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=32)
1433 mix_16_y_p010_sse2(d2, s2, sa);
1435 for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
1437 if(sa[0] < 0xff)
1439 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1444 else //fix me: only a workaround for non-mod-16 size video
1446 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1448 BYTE* sa = s;
1449 BYTE* s2 = s + src.pitch*src.h;
1450 BYTE* s2end_mod16 = s2 + (w&~15);
1451 BYTE* s2end = s2 + w;
1452 WORD* d2 = reinterpret_cast<WORD*>(d);
1453 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1455 if(sa[0] < 0xff)
1457 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1463 d = ddUV;
1464 BYTE* sa = src_origin;
1465 BYTE* s_uv = src_origin + src.pitch*src.h*2;//UV
1466 if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1467 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1469 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1471 BYTE* s_u2 = s_uv;
1472 BYTE* sa2 = sa;
1473 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1474 BYTE* s_u2end = s_u2 + w;
1475 BYTE* d2 = d;
1477 for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=32)
1479 mix_16_uv_p010_sse2(d2, s_u2, sa2, src.pitch);
1482 for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1484 unsigned int ia = (
1485 sa2[0]+ sa2[1]+
1486 sa2[0+src.pitch]+sa2[1+src.pitch]);
1487 if( ia!=0xFF*4 )
1489 d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1490 d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1495 else
1497 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1499 BYTE* s_u2 = s_uv;
1500 BYTE* sa2 = sa;
1501 BYTE* s_u2end = s_u2 + w;
1502 BYTE* d2 = d;
1504 for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1506 unsigned int ia = (
1507 sa2[0]+ sa2[1]+
1508 sa2[0+src.pitch]+sa2[1+src.pitch]);
1509 if( ia!=0xFF*4 )
1511 d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1512 d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1517 __asm emms;
1520 STDMETHODIMP CMemSubPic::AlphaBltAnv12_Nvxx( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1522 //fix me: check colorspace and log error
1523 const SubPicDesc& src = m_spd;
1524 SubPicDesc dst = *pTarget; // copy, because we might modify it
1526 CRect rs(*pSrc), rd(*pDst);
1527 if(dst.h < 0)
1529 dst.h = -dst.h;
1530 rd.bottom = dst.h - rd.bottom;
1531 rd.top = dst.h - rd.top;
1533 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1534 return E_INVALIDARG;
1536 int w = rs.Width(), h = rs.Height();
1537 bool bottom_down = rd.top > rd.bottom;
1539 BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1540 if(bottom_down)
1542 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1543 dst.pitch = -dst.pitch;
1546 //dst.pitch = abs(dst.pitch);
1547 int h2 = h/2;
1548 if(!dst.pitchUV)
1550 dst.pitchUV = abs(dst.pitch);
1552 if(!dst.bitsU)
1554 dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1556 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1557 if(bottom_down)
1559 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1560 dst.pitchUV = -dst.pitchUV;
1563 BYTE* sa= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1565 BYTE* s_uv = sa + src.pitch*src.h*2;//UV
1567 AlphaBltYv12Luma( d, dst.pitch, w, h, sa + src.pitch*src.h, sa, src.pitch );
1568 if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1569 reinterpret_cast<intptr_t>(ddUV) | static_cast<intptr_t>(dst.pitchUV) ) & 15 )==0 )
1571 BYTE* d = ddUV;
1572 int pitch = src.pitch;
1573 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1575 BYTE* s_u2 = s_uv;
1576 BYTE* sa2 = sa;
1577 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1578 BYTE* s_u2end = s_u2 + w;
1579 BYTE* d2 = d;
1581 for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=16)
1583 mix_16_uv_nvxx_sse2(d2, s_u2, sa2, src.pitch);
1585 for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1587 unsigned int ia = (
1588 sa2[0]+ sa2[1]+
1589 sa2[0+src.pitch]+sa2[1+src.pitch]);
1590 if( ia!=0xFF*4 )
1592 d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1593 d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1598 else
1600 BYTE* d = ddUV;
1601 int pitch = src.pitch;
1602 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1604 BYTE* s_u2 = s_uv;
1605 BYTE* sa2 = sa;
1606 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1607 BYTE* s_u2end = s_u2 + w;
1608 BYTE* d2 = d;
1610 for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1612 unsigned int ia = (
1613 sa2[0]+ sa2[1]+
1614 sa2[0+src.pitch]+sa2[1+src.pitch]);
1615 if( ia!=0xFF*4 )
1617 d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1618 d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1624 __asm emms;
1627 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1629 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1630 if(dirtyRectList!=NULL)
1632 POSITION pos = dirtyRectList->GetHeadPosition();
1633 if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1634 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1635 || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1637 while(pos!=NULL)
1639 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1640 cRectSrc.left &= ~15;
1641 cRectSrc.right = (cRectSrc.right+15)&~15;
1642 if(cRectSrc.right>m_spd.w)
1644 cRectSrc.right = m_spd.w;
1646 cRectSrc.top &= ~1;
1647 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1650 else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1652 while(pos!=NULL)
1654 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1655 cRectSrc.left &= ~3;
1656 cRectSrc.right = (cRectSrc.right+3)&~3;
1660 return __super::SetDirtyRectEx(dirtyRectList);
1664 // CMemSubPicAllocator
1667 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1668 : CSubPicExAllocatorImpl(maxsize, false, false)
1669 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1670 , m_maxsize(maxsize)
1671 , m_type(type)
1673 if(m_type==-1)
1675 switch(alpha_blt_dst_type)
1677 case MSP_YUY2:
1678 m_type = MSP_XY_AUYV;
1679 break;
1680 case MSP_AYUV:
1681 m_type = MSP_AYUV;
1682 break;
1683 case MSP_IYUV:
1684 case MSP_YV12:
1685 case MSP_P010:
1686 case MSP_P016:
1687 case MSP_NV12:
1688 case MSP_NV21:
1689 m_type = MSP_AYUV_PLANAR;
1690 break;
1691 default:
1692 m_type = MSP_RGBA;
1693 break;
1698 // ISubPicAllocatorImpl
1700 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1702 if(!ppSubPic) {
1703 return false;
1705 SubPicDesc spd;
1706 spd.w = m_maxsize.cx;
1707 spd.h = m_maxsize.cy;
1708 spd.bpp = 32;
1709 spd.pitch = (spd.w*spd.bpp)>>3;
1710 spd.type = m_type;
1711 spd.bits = DNew BYTE[spd.pitch*spd.h];
1712 if(!spd.bits) {
1713 return false;
1715 *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1716 if(!(*ppSubPic)) {
1717 return false;
1719 (*ppSubPic)->AddRef();
1720 return true;