SSE2 optimization for NV12/NV21 UV alpha blending.
[xy_vsfilter.git] / src / subpic / MemSubPic.cpp
blobf328e89c628e5b14b028295595f3d230ada726f2
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
// Rounded average of a 2x2 pixel block (two rows held in m128_1/m128_2):
// first a per-byte rounded average of the two rows (_mm_avg_epu8 computes
// (a+b+1)>>1), then a rounded average of the byte pair inside every 16-bit
// lane.  The result lands in the LOW byte of each 16-bit lane with the high
// byte zero, ready for _mm_packus_epi16.  Both argument registers are
// clobbered; the averaged values are left in m128_1.
26 #define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
27 m128_1 = _mm_avg_epu8(m128_1, m128_2); \
28 m128_2 = _mm_slli_epi16(m128_1, 8); \
29 m128_1 = _mm_srli_epi16(m128_1, 8); \
30 m128_2 = _mm_srli_epi16(m128_2, 8); \
31 m128_1 = _mm_avg_epu8(m128_1, m128_2);
// Pair average for interleaved data: averages the two rows per byte with
// rounding, then averages each byte with its neighbour inside every 16-bit
// lane.  Unlike AVERAGE_4_PIX_INTRINSICS the averaged value is duplicated
// into BOTH bytes of the lane (needed for interleaved UV where both byte
// positions are kept).  Both argument registers are clobbered; the result
// is left in m128_1.
// Wrapped in do/while(0) so the internal temporary is properly scoped and
// the macro expands to a single statement (hygiene fix: the old form leaked
// a local __m128i declaration into the caller's scope).
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
    do { \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_2 = _mm_slli_epi16(m128_1, 8); \
        __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
        m128_2 = _mm_or_si128(m128_2, m128_3); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    } while(0)
42 void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
44 const BYTE* end = u + w;
45 for (;u<end;dst+=2,u+=2,v+=2)
47 dst[0] = (u[0] + u[0+pitch] + 1)/2;
48 int tmp1 = (u[1] + u[1+pitch] + 1)/2;
49 dst[0] = (dst[0] + tmp1 + 1)/2;
50 dst[1] = (v[0] + v[0+pitch] + 1)/2;
51 tmp1 = (v[1] + v[1+pitch] + 1)/2;
52 dst[1] = (dst[1] + tmp1 + 1)/2;
// SSE2 version of subsample_and_interlace_2_line_c: consumes 16 U and 16 V
// samples from two rows per iteration and stores 16 interleaved UV bytes.
// Uses aligned loads/stores, so u, v, dst and pitch must all be 16-byte
// aligned; assumes w is a multiple of 16 — TODO confirm all callers.
56 __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
58 const BYTE* end = u + w;
59 for (;u<end;dst+=16,u+=16,v+=16)
61 __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
62 __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
63 __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
64 __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
// 2x2 rounded averages; results land in the low byte of each 16-bit lane.
65 AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
66 AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
// Pack the 8 averaged bytes of each plane, then interleave as U,V,U,V...
67 u_1 = _mm_packus_epi16(u_1, u_1);
68 v_1 = _mm_packus_epi16(v_1, v_1);
69 u_1 = _mm_unpacklo_epi8(u_1, v_1);
71 _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
75 static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
77 __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
78 __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
79 __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
80 __m128i zero = _mm_setzero_si128();
82 __m128i ones = _mm_cmpeq_epi32(ones,ones);
83 ones = _mm_cmpeq_epi8(ones,alpha128);
85 __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
86 __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);
88 __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
90 dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
91 dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
92 dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
94 dst128 = _mm_unpackhi_epi8(dst128, zero);
95 alpha128 = _mm_unpackhi_epi8(alpha128, zero);
97 ones2 = _mm_unpackhi_epi8(ones, zero);
99 dst128 = _mm_mullo_epi16(dst128, alpha128);
100 dst128 = _mm_adds_epu16(dst128, ones2);
101 dst128 = _mm_srli_epi16(dst128, 8);
102 dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
104 dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
105 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
108 /***
109 * output not exactly identical to pix_alpha_blend_yv12_chroma
111 static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
113 __m128i zero = _mm_setzero_si128();
114 __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
115 __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
116 __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );
118 __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
119 __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
121 AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);
123 __m128i ones = _mm_cmpeq_epi32(ones, ones);
124 ones = _mm_cmpeq_epi8(ones, alpha128_1);
126 dst128 = _mm_unpacklo_epi8(dst128, zero);
127 __m128i dst128_2 = _mm_and_si128(dst128, ones);
129 dst128 = _mm_mullo_epi16(dst128, alpha128_1);
130 dst128 = _mm_adds_epu16(dst128, dst128_2);
132 dst128 = _mm_srli_epi16(dst128, 8);
134 AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);
136 dst128 = _mm_adds_epi16(dst128, sub128_1);
137 dst128 = _mm_packus_epi16(dst128, dst128);
139 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
142 static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
144 unsigned int ia = (alpha[0]+alpha[1]+
145 alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
146 if(ia!=0xff)
148 *dst= (((*dst)*ia)>>8) + ((src[0] +src[1]+
149 src[src_pitch]+src[1+src_pitch] )>>2);
153 static void AlphaBltYv12Luma(byte* dst, int dst_pitch,
154 int w, int h,
155 const byte* sub, const byte* alpha, int sub_pitch)
157 if( ((reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch) |
158 reinterpret_cast<intptr_t>(dst) | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 )
160 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
162 const BYTE* sa = alpha;
163 const BYTE* s2 = sub;
164 const BYTE* s2end_mod16 = s2 + (w&~15);
165 const BYTE* s2end = s2 + w;
166 BYTE* d2 = dst;
168 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
170 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
172 for(; s2 < s2end; s2++, sa++, d2++)
174 if(sa[0] < 0xff)
176 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
181 else //fix me: only a workaround for non-mod-16 size video
183 for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
185 const BYTE* sa = alpha;
186 const BYTE* s2 = sub;
187 const BYTE* s2end_mod16 = s2 + (w&~15);
188 const BYTE* s2end = s2 + w;
189 BYTE* d2 = dst;
190 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
192 if(sa[0] < 0xff)
194 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
195 d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
202 static void AlphaBltYv12Chroma(byte* dst, int dst_pitch,
203 int w, int chroma_h,
204 const byte* sub_chroma, const byte* alpha, int sub_pitch)
206 if( ((reinterpret_cast<intptr_t>(sub_chroma) |
207 //reinterpret_cast<intptr_t>(dst) |
208 reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch)
209 //| (static_cast<intptr_t>(dst_pitch)&7)
210 ) & 15 )==0 )
212 int pitch = sub_pitch;
213 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
215 const BYTE* s2 = sub_chroma;
216 const BYTE* sa2 = alpha;
217 const BYTE* s2end_mod16 = s2 + (w&~15);
218 const BYTE* s2end = s2 + w;
219 BYTE* d2 = dst;
221 for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
223 pix_alpha_blend_yv12_chroma_sse2(d2, s2, sa2, sub_pitch);
225 for(; s2 < s2end; s2+=2, sa2+=2, d2++)
227 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
231 else//fix me: only a workaround for non-mod-16 size video
233 for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
235 const BYTE* s2 = sub_chroma;
236 const BYTE* sa2 = alpha;
237 const BYTE* s2end_mod16 = s2 + (w&~15);
238 const BYTE* s2end = s2 + w;
239 BYTE* d2 = dst;
240 for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
242 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
248 __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
250 //important!
251 __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
252 __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
253 __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
255 __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
256 alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
258 __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
259 //so we do it another way
260 //first, (alpha<<8)+0xff
261 __m128i ones = _mm_setzero_si128();
262 ones = _mm_cmpeq_epi16(dst_y, ones);
264 __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
265 ones = _mm_xor_si128(ones, ones2);
266 ones = _mm_srli_epi16(ones, 15);
267 ones = _mm_and_si128(ones, lo);
269 dst_y = _mm_mulhi_epu16(dst_y, lo);
270 dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
272 lo = _mm_setzero_si128();
273 lo = _mm_unpacklo_epi8(lo, src_y);
274 dst_y = _mm_adds_epu16(dst_y, lo);
275 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
277 dst += 16;
278 dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
280 lo = _mm_unpackhi_epi8(alpha_ff, alpha);
282 ones = _mm_setzero_si128();
283 ones = _mm_cmpeq_epi16(dst_y, ones);
284 ones = _mm_xor_si128(ones, ones2);
285 ones = _mm_srli_epi16(ones, 15);
286 ones = _mm_and_si128(ones, lo);
288 dst_y = _mm_mulhi_epu16(dst_y, lo);
289 dst_y = _mm_adds_epu16(dst_y, ones);
291 lo = _mm_setzero_si128();
292 lo = _mm_unpackhi_epi8(lo, src_y);
293 dst_y = _mm_adds_epu16(dst_y, lo);
294 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
297 //for test only
298 void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
300 WORD* dst_word = reinterpret_cast<WORD*>(dst);
301 for (int i=0;i<16;i++)
303 if (src_alpha[i]!=0xff)
305 dst_word[i] = ((dst_word[i] *src_alpha[i])>>8) + (src[i]<<8);
310 __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
312 //important!
313 __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
314 __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
316 __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
317 __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
319 AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);
321 __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
322 alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
324 __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
325 //so we do it another way
326 //first, (alpha<<8)+0xff
327 __m128i ones = _mm_setzero_si128();
328 ones = _mm_cmpeq_epi16(dst_y, ones);
330 __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
331 ones = _mm_xor_si128(ones, ones2);
332 ones = _mm_srli_epi16(ones, 15);
333 ones = _mm_and_si128(ones, lo);
335 dst_y = _mm_mulhi_epu16(dst_y, lo);
336 dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
338 lo = _mm_setzero_si128();
339 lo = _mm_unpacklo_epi8(lo, src_y);
340 dst_y = _mm_adds_epu16(dst_y, lo);
341 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
343 dst += 16;
344 dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
346 lo = _mm_unpackhi_epi8(alpha_ff, alpha);
348 ones = _mm_setzero_si128();
349 ones = _mm_cmpeq_epi16(dst_y, ones);
350 ones = _mm_xor_si128(ones, ones2);
351 ones = _mm_srli_epi16(ones, 15);
352 ones = _mm_and_si128(ones, lo);
354 dst_y = _mm_mulhi_epu16(dst_y, lo);
355 dst_y = _mm_adds_epu16(dst_y, ones);
357 lo = _mm_setzero_si128();
358 lo = _mm_unpackhi_epi8(lo, src_y);
359 dst_y = _mm_adds_epu16(dst_y, lo);
360 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
363 //for test only
364 void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
366 WORD* dst_word = reinterpret_cast<WORD*>(dst);
367 for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
369 unsigned int ia = (
370 (src_alpha[0]+src_alpha[0+pitch]+1)/2+
371 (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
372 if( ia!=0xFF )
374 int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
375 if(tmp>0xffff) tmp = 0xffff;
376 dst_word[0] = tmp;
377 tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
378 if(tmp>0xffff) tmp = 0xffff;
379 dst_word[1] = tmp;
384 __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
386 __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
387 __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
388 __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
389 __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
391 AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
392 __m128i zero = _mm_setzero_si128();
394 __m128i ones = _mm_cmpeq_epi32(ones,ones);
395 ones = _mm_cmpeq_epi8(ones,alpha128_1);
397 __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
398 alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);
400 __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
402 dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
403 dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
404 dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
406 dst128 = _mm_unpackhi_epi8(dst128, zero);
407 alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);
409 ones2 = _mm_unpackhi_epi8(ones, zero);
411 dst128 = _mm_mullo_epi16(dst128, alpha128_1);
412 dst128 = _mm_adds_epu16(dst128, ones2);
413 dst128 = _mm_srli_epi16(dst128, 8);
414 dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
416 dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
417 _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
420 //for test only
421 void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
423 for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
425 unsigned int ia = (
426 (src_alpha[0]+src_alpha[0+pitch]+1)/2+
427 (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
428 if( ia!=0xFF )
430 dst[0] = (((dst[0])*ia)>>8) + src[0];
431 dst[1] = (((dst[1])*ia)>>8) + src[1];
437 // CMemSubPic
// Wraps the given descriptor (whose bits buffer this object takes over and
// frees in the destructor) and marks the whole surface dirty so the first
// Unlock/CopyTo processes everything.
440 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
441 : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
443 m_maxsize.SetSize(spd.w, spd.h);
444 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
// Initially the entire subpic area is dirty.
445 CRect allSpd(0,0,spd.w, spd.h);
446 m_rectListDirty.AddTail(allSpd);
449 CMemSubPic::~CMemSubPic()
451 delete [] m_spd.bits, m_spd.bits = NULL;
454 // ISubPic
456 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
458 return (void*)&m_spd;
461 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
463 spd.type = m_spd.type;
464 spd.w = m_size.cx;
465 spd.h = m_size.cy;
466 spd.bpp = m_spd.bpp;
467 spd.pitch = m_spd.pitch;
468 spd.bits = m_spd.bits;
469 spd.bitsU = m_spd.bitsU;
470 spd.bitsV = m_spd.bitsV;
471 spd.vidrect = m_vidrect;
472 return S_OK;
475 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
477 HRESULT hr;
478 if(FAILED(hr = __super::CopyTo(pSubPic))) {
479 return hr;
482 SubPicDesc src, dst;
483 if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
484 return E_FAIL;
486 while(!m_rectListDirty.IsEmpty())
488 CRect& cRect = m_rectListDirty.GetHead();
489 int w = cRect.Width(), h = cRect.Height();
490 BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
491 BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
492 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
493 memcpy(d, s, w*4);
495 return S_OK;
// Fills every dirty rectangle with `color` (packed formats) or resets the
// four stacked planes of an MSP_AYUV_PLANAR subpic (alpha plane to 0xFF,
// the three following planes to 0), then empties the dirty list.
498 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
500 if(m_rectListDirty.IsEmpty()) {
501 return S_OK;
503 while(!m_rectListDirty.IsEmpty())
505 //pDirtyRect = m_rectListDirty.RemoveHead();
// NOTE(review): RemoveTail() returns by value; binding a non-const CRect&
// to it relies on a compiler extension — confirm under /permissive-.
506 CRect& dirtyRect = m_rectListDirty.RemoveTail();
507 BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
508 int w = dirtyRect.Width();
509 if(m_spd.type!=MSP_AYUV_PLANAR)
// Packed formats: fill each row with the 32-bit color (rep stosd on x86:
// eax = fill value, ecx = count of DWORDs, edi = row start).
511 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
513 #ifdef _WIN64
514 memsetd(p, color, w*4); // nya
515 #else
516 __asm
518 mov eax, color
519 mov ecx, w
520 mov edi, p
522 rep stosd
525 #endif
528 else
530 ///TODO:
531 ///FIX ME
// Planar layout: four consecutive h*pitch planes.  The first plane is set
// to 0xFF (per-byte) and the next three to 0 for each dirty row.
532 for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
534 // memsetd(p, 0, m_rcDirty.Width());
535 //DbgLog((LOG_TRACE, 3, "w:%d", w));
536 //w = pDirtyRect->Width();
537 memset(p, 0xFF, w);
538 memset(p+m_spd.h*m_spd.pitch, 0, w);
539 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
540 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
544 m_rectListDirty.RemoveAll();
545 return S_OK;
// Lock simply hands out the current descriptor; this memory-backed subpic
// performs no actual locking/synchronization.
548 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
550 return GetDesc(spd);
553 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
555 int src_type = m_spd.type;
556 int dst_type = m_alpha_blt_dst_type;
557 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
558 dst_type == MSP_RGB24 ||
559 dst_type == MSP_RGB16 ||
560 dst_type == MSP_RGB15))
562 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
564 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
566 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
567 dst_type == MSP_YV12 ||
568 dst_type == MSP_P010 ||
569 dst_type == MSP_P016 ||
570 dst_type == MSP_NV12 ||
571 dst_type == MSP_NV21)))
573 return UnlockOther(dirtyRectList);
575 else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
576 dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
577 dst_type == MSP_IYUV ||
578 dst_type == MSP_YV12 ||
579 dst_type == MSP_NV12 ||
580 dst_type == MSP_NV21 ||
581 dst_type == MSP_P010 ||
582 dst_type == MSP_P016))
584 return UnlockRGBA_YUV(dirtyRectList);
586 return E_NOTIMPL;
589 STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
591 SetDirtyRectEx(dirtyRectList);
592 if(m_rectListDirty.IsEmpty()) {
593 return S_OK;
596 POSITION pos = m_rectListDirty.GetHeadPosition();
597 while(pos!=NULL)
599 const CRect& cRect = m_rectListDirty.GetNext(pos);
600 int w = cRect.Width(), h = cRect.Height();
601 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
602 BYTE* bottom = top + m_spd.pitch*h;
603 if(m_alpha_blt_dst_type == MSP_RGB16)
605 for(; top < bottom ; top += m_spd.pitch)
607 DWORD* s = (DWORD*)top;
608 DWORD* e = s + w;
609 for(; s < e; s++)
611 *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
612 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
616 else if(m_alpha_blt_dst_type == MSP_RGB15)
618 for(; top < bottom; top += m_spd.pitch)
620 DWORD* s = (DWORD*)top;
621 DWORD* e = s + w;
622 for(; s < e; s++)
624 *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
625 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
629 else if(m_alpha_blt_dst_type == MSP_YUY2)
631 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
633 for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
635 BYTE* s = tempTop;
636 BYTE* e = s + w*4;
637 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
639 s[4] = (s[0] + s[4])>>1;
640 s[0] = (s[2] + s[6])>>1;
644 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
646 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV || m_alpha_blt_dst_type == MSP_YV12 )
648 //nothing to do
650 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
651 || m_alpha_blt_dst_type == MSP_NV12 )
653 SubsampleAndInterlace(cRect, true);
655 else if( m_alpha_blt_dst_type == MSP_NV21 )
657 SubsampleAndInterlace(cRect, false);
660 return S_OK;
// Converts the rendered RGBA sub-picture in place to the YUV intermediate
// form the AlphaBlt routines consume, using the fixed-point tables of the
// current ColorConvTable.  Planar/YUY2-family targets get pair-wise
// "AxYU AxYV" (Y into bytes 1/5, chroma via c2y_cu/c2y_cv into bytes 0/4);
// MSP_AYUV gets a per-pixel ARGB -> AYUV conversion.
663 STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
665 SetDirtyRectEx(dirtyRectList);
666 if(m_rectListDirty.IsEmpty()) {
667 return S_OK;
// Cache the conversion tables/coefficients locally for the loops below.
670 const ColorConvTable *conv_table = ColorConvTable::GetDefaultColorConvTable();
671 const int *c2y_yb = conv_table->c2y_yb;
672 const int *c2y_yg = conv_table->c2y_yg;
673 const int *c2y_yr = conv_table->c2y_yr;
674 const int cy_cy2 = conv_table->cy_cy2;
675 const int c2y_cu = conv_table->c2y_cu;
676 const int c2y_cv = conv_table->c2y_cv;
677 const int cy_cy = conv_table->cy_cy;
678 const unsigned char* Clip = conv_table->Clip;
680 POSITION pos = m_rectListDirty.GetHeadPosition();
681 while(pos!=NULL)
683 const CRect& cRect = m_rectListDirty.GetNext(pos);
684 int w = cRect.Width(), h = cRect.Height();
686 BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
687 BYTE* bottom = top + m_spd.pitch*h;
689 if( m_alpha_blt_dst_type == MSP_YUY2 ||
690 m_alpha_blt_dst_type == MSP_YV12 ||
691 m_alpha_blt_dst_type == MSP_IYUV ||
692 m_alpha_blt_dst_type == MSP_P010 ||
693 m_alpha_blt_dst_type == MSP_P016 ||
694 m_alpha_blt_dst_type == MSP_NV12 ||
695 m_alpha_blt_dst_type == MSP_NV21) {
696 for(; top < bottom ; top += m_spd.pitch) {
697 BYTE* s = top;
698 BYTE* e = s + w*4;
// Two pixels at a time; 0x1fe == both alphas fully transparent, in
// which case everything is cleared instead of converted.
699 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
700 if((s[3]+s[7]) < 0x1fe) {
701 int a = 0x200 - (s[3]+s[7]);
702 a <<= 7;
703 // 0 <= a <= 0x10000
704 s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
705 s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a + 0x8000) >> 16;
707 int scaled_y = (s[1]+s[5]-32) * cy_cy2;
// Shared chroma for the pixel pair, clipped via the table.
709 s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
710 s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
711 } else {
712 s[1] = s[5] = 0;
713 s[0] = s[4] = 0;
718 else if(m_alpha_blt_dst_type == MSP_AYUV) {
719 for(; top < bottom ; top += m_spd.pitch) {
720 BYTE* s = top;
721 BYTE* e = s + w*4;
722 for(; s < e; s+=4) { // ARGB -> AYUV
723 if(s[3] < 0xff) {
724 int a = 0x100 - s[3];
725 a <<= 8;
726 // 0 <= a <= 0x10000
728 int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
729 int scaled_y = (y-32) * cy_cy;
730 s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
731 s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
732 s[2] = y;
733 } else {
734 s[0] = s[1] = 0;
735 s[2] = 0;
741 return S_OK;
744 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
746 //fix me: check alignment and log error
747 int w = cRect.Width(), h = cRect.Height();
748 BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
749 BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
750 BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
751 BYTE* dst = u_start;
752 if(!u_first)
754 BYTE* tmp = v_start;
755 v_start = u_start;
756 u_start = tmp;
758 for (int i=0;i<h;i+=2)
760 subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w, m_spd.pitch);
761 u_start += 2*m_spd.pitch;
762 v_start += 2*m_spd.pitch;
763 dst += m_spd.pitch;
767 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
769 if(!pSrc || !pDst || !pTarget) {
770 return E_POINTER;
772 int src_type = m_spd.type;
773 int dst_type = pTarget->type;
775 if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
776 dst_type == MSP_RGB24 ||
777 dst_type == MSP_RGB16 ||
778 dst_type == MSP_RGB15 ||
779 dst_type == MSP_RGBA ||
780 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
781 dst_type == MSP_AYUV ))
783 (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
785 (src_type==MSP_AYUV && dst_type == MSP_AYUV)
787 (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
788 dst_type == MSP_YV12)) )
790 return AlphaBltOther(pSrc, pDst, pTarget);
792 else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
793 dst_type == MSP_NV21 ) )
795 return AlphaBltAnv12_Nvxx(pSrc, pDst, pTarget);
798 else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
799 dst_type == MSP_P016 ) )
801 return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
803 else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
804 dst_type == MSP_YV12))
806 return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
808 else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
809 dst_type == MSP_NV21))
811 return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
813 else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
814 dst_type == MSP_P016))
816 return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
818 return E_NOTIMPL;
// Blends the sub-picture onto a packed-RGB, YUY2, AYUV or planar YV12/IYUV
// target.  rs/rd must be the same size; a negative dst.h (or rd.top >
// rd.bottom) selects a bottom-up walk with negated pitch.
821 STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
823 const SubPicDesc& src = m_spd;
824 SubPicDesc dst = *pTarget; // copy, because we might modify it
826 CRect rs(*pSrc), rd(*pDst);
827 if(dst.h < 0)
829 dst.h = -dst.h;
830 rd.bottom = dst.h - rd.bottom;
831 rd.top = dst.h - rd.top;
833 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
834 return E_INVALIDARG;
836 int w = rs.Width(), h = rs.Height();
837 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
838 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Bottom-up destination: start at the last row and negate the pitch.
839 if(rd.top > rd.bottom)
841 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
842 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
843 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
845 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
847 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
849 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
851 else
853 return E_NOTIMPL;
855 dst.pitch = -dst.pitch;
857 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
858 switch(dst.type)
// RGBA target: divide the color channels by the remaining coverage
// (0x100 - alpha) — appears to un-premultiply — and invert the alpha
// byte.  TODO(review): confirm the intended alpha convention.
860 case MSP_RGBA:
861 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
863 BYTE* s2 = s;
864 BYTE* s2end = s2 + w*4;
865 DWORD* d2 = (DWORD*)d;
866 for(; s2 < s2end; s2 += 4, d2++)
868 if(s2[3] < 0xff)
870 DWORD bd =0x00000100 -( (DWORD) s2[3]);
871 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
872 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
873 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
874 *d2 = B | V | R
875 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
879 break;
// 32-bit blend, two channels per multiply via the 0x00ff00ff masks.
// The x64 branch scales the source by (256-alpha); the x86 branch adds
// the source as-is.
880 case MSP_RGB32:
881 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
882 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
884 BYTE* s2 = s;
885 BYTE* s2end = s2 + w*4;
886 DWORD* d2 = (DWORD*)d;
887 for(; s2 < s2end; s2 += 4, d2++)
889 #ifdef _WIN64
890 DWORD ia = 256-s2[3];
891 if(s2[3] < 0xff) {
892 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
893 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
895 #else
896 if(s2[3] < 0xff)
898 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
899 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
901 #endif
904 break;
905 case MSP_RGB24:
906 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
908 BYTE* s2 = s;
909 BYTE* s2end = s2 + w*4;
910 BYTE* d2 = d;
911 for(; s2 < s2end; s2 += 4, d2 += 3)
913 if(s2[3] < 0xff)
915 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
916 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
917 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
921 break;
// 16-bit targets: alpha was reduced to 5 bits by UnlockOther, hence the
// 0x1f test and >>5 blends on the packed channel masks.
922 case MSP_RGB16:
923 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
925 BYTE* s2 = s;
926 BYTE* s2end = s2 + w*4;
927 WORD* d2 = (WORD*)d;
928 for(; s2 < s2end; s2 += 4, d2++)
930 if(s2[3] < 0x1f)
932 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
933 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
934 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
935 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
936 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
941 break;
942 case MSP_RGB15:
943 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
945 BYTE* s2 = s;
946 BYTE* s2end = s2 + w*4;
947 WORD* d2 = (WORD*)d;
948 for(; s2 < s2end; s2 += 4, d2++)
950 if(s2[3] < 0x1f)
952 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
953 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
954 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
955 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
956 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
961 break;
// YUY2: two pixels per iteration blended with MMX; the source was
// converted to AxYU AxYV form by Unlock, and ia is the pair-averaged
// alpha.
962 case MSP_YUY2:
963 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
965 unsigned int ia, c;
966 BYTE* s2 = s;
967 BYTE* s2end = s2 + w*4;
968 DWORD* d2 = (DWORD*)d;
969 for(; s2 < s2end; s2 += 8, d2++)
971 ia = (s2[3]+s2[7])>>1;
972 if(ia < 0xff)
974 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
975 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
976 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
977 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
978 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
980 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
981 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
982 __asm
984 mov edi, d2
985 pxor mm0, mm0
986 movd mm2, c
987 punpcklbw mm2, mm0
988 movd mm3, [edi]
989 punpcklbw mm3, mm0
990 movd mm4, ia
991 punpcklbw mm4, mm0
992 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
993 pmullw mm3, mm4
994 psraw mm3, 7
995 paddsw mm3, mm2
996 packuswb mm3, mm3
997 movd [edi], mm3
1002 __asm emms;
1003 break;
// Planar: blend the luma plane, then the two chroma planes at half
// height.  The subtitle's A/Y/U/V planes are stacked h*pitch apart.
1004 case MSP_YV12:
1005 case MSP_IYUV:
1007 //dst.pitch = abs(dst.pitch);
1008 int h2 = h/2;
1009 if(!dst.pitchUV)
1011 dst.pitchUV = abs(dst.pitch)/2;
1013 if(!dst.bitsU || !dst.bitsV)
1015 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
1016 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1017 if(dst.type == MSP_YV12)
1019 BYTE* p = dst.bitsU;
1020 dst.bitsU = dst.bitsV;
1021 dst.bitsV = p;
1024 BYTE* dd[2];
1025 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1026 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1027 if(rd.top > rd.bottom)
1029 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1030 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1031 dst.pitchUV = -dst.pitchUV;
1034 BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
1036 BYTE* ss[2];
1037 ss[0] = src_origin + src.pitch*src.h*2;//U
1038 ss[1] = src_origin + src.pitch*src.h*3;//V
1040 AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
1042 AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
1043 AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
1045 __asm emms;
1047 break;
1048 default:
1049 return E_NOTIMPL;
1050 break;
// emms costs ~40 CPU cycles (translated from a mojibake Chinese comment),
// which is why it is issued inside the MMX cases instead of once here.
1053 //emms costs ~40 CPU cycles
1054 //__asm emms;
1055 return S_OK;
1058 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1060 const SubPicDesc& src = m_spd;
1061 SubPicDesc dst = *pTarget; // copy, because we might modify it
1063 CRect rs(*pSrc), rd(*pDst);
1065 if(dst.h < 0) {
1066 dst.h = -dst.h;
1067 rd.bottom = dst.h - rd.bottom;
1068 rd.top = dst.h - rd.top;
1071 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1072 return E_INVALIDARG;
1075 int w = rs.Width(), h = rs.Height();
1078 BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1079 BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1081 if(rd.top > rd.bottom) {
1082 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1084 dst.pitch = -dst.pitch;
1087 for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1089 BYTE* s2 = s;
1090 BYTE* s2end = s2 + w*4;
1091 WORD* d2 = reinterpret_cast<WORD*>(d);
1092 for(; s2 < s2end; s2 += 4, d2++)
1094 if(s2[3] < 0xff) {
1095 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
1100 //UV
1101 int h2 = h/2;
1102 if(!dst.pitchUV)
1104 dst.pitchUV = abs(dst.pitch);
1106 if(!dst.bitsU || !dst.bitsV)
1108 dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1109 dst.bitsV = dst.bitsU + 2;
1111 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1112 if(rd.top > rd.bottom)
1114 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1115 dst.pitchUV = -dst.pitchUV;
1118 s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1120 d = ddUV;
1121 int pitch = src.pitch;
1122 for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
1124 BYTE* s2 = s;
1125 WORD* d2=reinterpret_cast<WORD*>(d);
1126 WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
1127 for( ; d2<d2_end; s2+=8, d2+=2)
1129 unsigned int ia = (
1130 s2[3]+ s2[3+4]+
1131 s2[3+src.pitch]+s2[3+4+src.pitch]);
1132 if( ia!=0xFF*4 )
1134 d2[0] = (((d2[0])*ia)>>10) + ((s2[0] + s2[0+src.pitch])<<7);
1135 d2[1] = (((d2[1])*ia)>>10) + ((s2[4] + s2[4+src.pitch])<<7);
1140 return S_OK;
1143 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1145 const SubPicDesc& src = m_spd;
1146 SubPicDesc dst = *pTarget; // copy, because we might modify it
1148 CRect rs(*pSrc), rd(*pDst);
1150 if(dst.h < 0) {
1151 dst.h = -dst.h;
1152 rd.bottom = dst.h - rd.bottom;
1153 rd.top = dst.h - rd.top;
1156 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1157 return E_INVALIDARG;
1160 int w = rs.Width(), h = rs.Height();
1162 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1163 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1165 if(rd.top > rd.bottom) {
1166 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1168 dst.pitch = -dst.pitch;
1171 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1172 BYTE* s2 = s;
1173 BYTE* s2end = s2 + w*4;
1174 BYTE* d2 = d;
1175 for(; s2 < s2end; s2 += 4, d2++) {
1176 if(s2[3] < 0xff) {
1177 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1181 dst.pitch = abs(dst.pitch);
1183 int h2 = h/2;
1185 if(!dst.pitchUV) {
1186 dst.pitchUV = dst.pitch/2;
1189 BYTE* ss[2];
1190 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1191 ss[1] = ss[0] + 4;
1193 if(!dst.bitsU || !dst.bitsV) {
1194 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1195 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1197 if(dst.type == MSP_YV12) {
1198 BYTE* p = dst.bitsU;
1199 dst.bitsU = dst.bitsV;
1200 dst.bitsV = p;
1204 BYTE* dd[2];
1205 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1206 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1208 if(rd.top > rd.bottom) {
1209 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1210 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1211 dst.pitchUV = -dst.pitchUV;
1214 for(ptrdiff_t i = 0; i < 2; i++) {
1215 s = ss[i];
1216 d = dd[i];
1217 BYTE* is = ss[1-i];
1218 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1219 BYTE* s2 = s;
1220 BYTE* s2end = s2 + w*4;
1221 BYTE* d2 = d;
1222 BYTE* is2 = is;
1223 for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
1224 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1225 if(ia < 0xff) {
1226 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1232 return S_OK;
1235 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1237 const SubPicDesc& src = m_spd;
1238 SubPicDesc dst = *pTarget; // copy, because we might modify it
1240 CRect rs(*pSrc), rd(*pDst);
1242 if(dst.h < 0) {
1243 dst.h = -dst.h;
1244 rd.bottom = dst.h - rd.bottom;
1245 rd.top = dst.h - rd.top;
1248 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1249 return E_INVALIDARG;
1252 int w = rs.Width(), h = rs.Height();
1254 BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1255 BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1257 if(rd.top > rd.bottom) {
1258 d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1260 dst.pitch = -dst.pitch;
1263 for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1264 BYTE* s2 = s;
1265 BYTE* s2end = s2 + w*4;
1266 BYTE* d2 = d;
1267 for(; s2 < s2end; s2 += 4, d2++) {
1268 if(s2[3] < 0xff) {
1269 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1273 dst.pitch = abs(dst.pitch);
1275 int h2 = h/2;
1277 if(!dst.pitchUV) {
1278 dst.pitchUV = dst.pitch;
1281 BYTE* ss[2];
1282 ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1283 ss[1] = ss[0] + 4;
1285 if(!dst.bitsU || !dst.bitsV) {
1286 dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1287 dst.bitsV = dst.bitsU + 1;
1289 if(dst.type == MSP_NV21) {
1290 BYTE* p = dst.bitsU;
1291 dst.bitsU = dst.bitsV;
1292 dst.bitsV = p;
1296 BYTE* dd[2];
1297 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1298 dd[1] = dd[0]+1;
1300 if(rd.top > rd.bottom) {
1301 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1302 dd[1] = dd[0]+1;
1303 dst.pitchUV = -dst.pitchUV;
1306 for(ptrdiff_t i = 0; i < 2; i++) {
1307 s = ss[i];
1308 d = dd[i];
1309 BYTE* is = ss[1-i];
1310 for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1311 BYTE* s2 = s;
1312 BYTE* s2end = s2 + w*4;
1313 BYTE* d2 = d;
1314 BYTE* is2 = is;
1315 for(; s2 < s2end; s2 += 8, d2+=2, is2 += 8) {
1316 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1317 if(ia < 0xff) {
1318 *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1324 return S_OK;
1327 STDMETHODIMP CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1329 //fix me: check colorspace and log error
1330 const SubPicDesc& src = m_spd;
1331 SubPicDesc dst = *pTarget; // copy, because we might modify it
1333 CRect rs(*pSrc), rd(*pDst);
1334 if(dst.h < 0)
1336 dst.h = -dst.h;
1337 rd.bottom = dst.h - rd.bottom;
1338 rd.top = dst.h - rd.top;
1340 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1341 return E_INVALIDARG;
1343 int w = rs.Width(), h = rs.Height();
1344 bool bottom_down = rs.top > rd.bottom;
1346 BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1347 if(bottom_down)
1349 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1350 dst.pitch = -dst.pitch;
1353 //dst.pitch = abs(dst.pitch);
1354 int h2 = h/2;
1355 if(!dst.pitchUV)
1357 dst.pitchUV = abs(dst.pitch);
1359 dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1360 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1361 if(bottom_down)
1363 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1364 dst.pitchUV = -dst.pitchUV;
1367 BYTE* src_origin= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1368 BYTE *s = src_origin;
1370 // equivalent:
1371 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
1372 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
1373 if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
1374 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1376 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1378 BYTE* sa = s;
1379 BYTE* s2 = s + src.pitch*src.h;
1380 BYTE* s2end_mod16 = s2 + (w&~15);
1381 BYTE* s2end = s2 + w;
1382 BYTE* d2 = d;
1384 for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=32)
1386 mix_16_y_p010_sse2(d2, s2, sa);
1388 for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
1390 if(sa[0] < 0xff)
1392 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1397 else //fix me: only a workaround for non-mod-16 size video
1399 for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1401 BYTE* sa = s;
1402 BYTE* s2 = s + src.pitch*src.h;
1403 BYTE* s2end_mod16 = s2 + (w&~15);
1404 BYTE* s2end = s2 + w;
1405 WORD* d2 = reinterpret_cast<WORD*>(d);
1406 for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1408 if(sa[0] < 0xff)
1410 d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1416 d = ddUV;
1417 BYTE* sa = src_origin;
1418 BYTE* s_uv = src_origin + src.pitch*src.h*2;//UV
1419 if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1420 reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1422 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1424 BYTE* s_u2 = s_uv;
1425 BYTE* sa2 = sa;
1426 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1427 BYTE* s_u2end = s_u2 + w;
1428 BYTE* d2 = d;
1430 for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=32)
1432 mix_16_uv_p010_sse2(d2, s_u2, sa2, src.pitch);
1435 for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1437 unsigned int ia = (
1438 sa2[0]+ sa2[1]+
1439 sa2[0+src.pitch]+sa2[1+src.pitch]);
1440 if( ia!=0xFF*4 )
1442 d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1443 d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1448 else
1450 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1452 BYTE* s_u2 = s_uv;
1453 BYTE* sa2 = sa;
1454 BYTE* s_u2end = s_u2 + w;
1455 BYTE* d2 = d;
1457 for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1459 unsigned int ia = (
1460 sa2[0]+ sa2[1]+
1461 sa2[0+src.pitch]+sa2[1+src.pitch]);
1462 if( ia!=0xFF*4 )
1464 d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1465 d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1470 __asm emms;
1473 STDMETHODIMP CMemSubPic::AlphaBltAnv12_Nvxx( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1475 //fix me: check colorspace and log error
1476 const SubPicDesc& src = m_spd;
1477 SubPicDesc dst = *pTarget; // copy, because we might modify it
1479 CRect rs(*pSrc), rd(*pDst);
1480 if(dst.h < 0)
1482 dst.h = -dst.h;
1483 rd.bottom = dst.h - rd.bottom;
1484 rd.top = dst.h - rd.top;
1486 if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1487 return E_INVALIDARG;
1489 int w = rs.Width(), h = rs.Height();
1490 bool bottom_down = rs.top > rd.bottom;
1492 BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1493 if(bottom_down)
1495 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1496 dst.pitch = -dst.pitch;
1499 //dst.pitch = abs(dst.pitch);
1500 int h2 = h/2;
1501 if(!dst.pitchUV)
1503 dst.pitchUV = abs(dst.pitch);
1505 if(!dst.bitsU)
1507 dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1509 BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1510 if(bottom_down)
1512 ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1513 dst.pitchUV = -dst.pitchUV;
1516 BYTE* sa= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1518 BYTE* s_uv = sa + src.pitch*src.h*2;//UV
1520 AlphaBltYv12Luma( d, dst.pitch, w, h, sa + src.pitch*src.h, sa, src.pitch );
1521 if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1522 reinterpret_cast<intptr_t>(ddUV) | static_cast<intptr_t>(dst.pitchUV) ) & 15 )==0 )
1524 BYTE* d = ddUV;
1525 int pitch = src.pitch;
1526 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1528 BYTE* s_u2 = s_uv;
1529 BYTE* sa2 = sa;
1530 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1531 BYTE* s_u2end = s_u2 + w;
1532 BYTE* d2 = d;
1534 for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=16)
1536 mix_16_uv_nvxx_sse2(d2, s_u2, sa2, src.pitch);
1538 for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1540 unsigned int ia = (
1541 sa2[0]+ sa2[1]+
1542 sa2[0+src.pitch]+sa2[1+src.pitch]);
1543 if( ia!=0xFF*4 )
1545 d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1546 d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1551 else
1553 BYTE* d = ddUV;
1554 int pitch = src.pitch;
1555 for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1557 BYTE* s_u2 = s_uv;
1558 BYTE* sa2 = sa;
1559 BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1560 BYTE* s_u2end = s_u2 + w;
1561 BYTE* d2 = d;
1563 for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1565 unsigned int ia = (
1566 sa2[0]+ sa2[1]+
1567 sa2[0+src.pitch]+sa2[1+src.pitch]);
1568 if( ia!=0xFF*4 )
1570 d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1571 d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1577 __asm emms;
1580 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1582 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1583 if(dirtyRectList!=NULL)
1585 POSITION pos = dirtyRectList->GetHeadPosition();
1586 if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1587 || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1588 || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1590 while(pos!=NULL)
1592 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1593 cRectSrc.left &= ~15;
1594 cRectSrc.right = (cRectSrc.right+15)&~15;
1595 cRectSrc.top &= ~1;
1596 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1599 else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1601 while(pos!=NULL)
1603 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1604 cRectSrc.left &= ~3;
1605 cRectSrc.right = (cRectSrc.right+3)&~3;
1609 return __super::SetDirtyRectEx(dirtyRectList);
1613 // CMemSubPicAllocator
1616 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1617 : CSubPicExAllocatorImpl(maxsize, false, false)
1618 , m_alpha_blt_dst_type(alpha_blt_dst_type)
1619 , m_maxsize(maxsize)
1620 , m_type(type)
1622 if(m_type==-1)
1624 switch(alpha_blt_dst_type)
1626 case MSP_YUY2:
1627 m_type = MSP_XY_AUYV;
1628 break;
1629 case MSP_AYUV:
1630 m_type = MSP_AYUV;
1631 break;
1632 case MSP_IYUV:
1633 case MSP_YV12:
1634 case MSP_P010:
1635 case MSP_P016:
1636 case MSP_NV12:
1637 case MSP_NV21:
1638 m_type = MSP_AYUV_PLANAR;
1639 break;
1640 default:
1641 m_type = MSP_RGBA;
1642 break;
1647 // ISubPicAllocatorImpl
1649 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1651 if(!ppSubPic) {
1652 return false;
1654 SubPicDesc spd;
1655 spd.w = m_maxsize.cx;
1656 spd.h = m_maxsize.cy;
1657 spd.bpp = 32;
1658 spd.pitch = (spd.w*spd.bpp)>>3;
1659 spd.type = m_type;
1660 spd.bits = DNew BYTE[spd.pitch*spd.h];
1661 if(!spd.bits) {
1662 return false;
1664 *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1665 if(!(*ppSubPic)) {
1666 return false;
1668 (*ppSubPic)->AddRef();
1669 return true;