/*
 * Copyright (C) 2003-2006 Gabest
 * http://www.gabest.org
 *
 * This Program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This Program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 */
#include "MemSubPic.h"
#include "color_conv_table.h"
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);

#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
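
// Note on the two averaging macros above (a reading aid, not part of the
// original logic): both start from two rows of 8-bit samples and produce, per
// 16-bit lane, the rounded average of a 2x2 block,
//     avg4 = ((a + b + 1)/2 + (c + d + 1)/2 + 1)/2,
// which matches the scalar subsample_and_interlace_2_line_c below.
// AVERAGE_4_PIX_INTRINSICS leaves the result only in the low byte of each
// 16-bit lane (high byte cleared); AVERAGE_4_PIX_INTRINSICS_2 swaps and ORs
// instead of shifting down, so the same average ends up in both bytes of the
// lane, which is what the NV12/P010 mixers expect.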
void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        dst[0] = (dst[0] + tmp1 + 1)/2;
        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        dst[1] = (dst[1] + tmp1 + 1)/2;
    }
}
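
// The routine above (and its SSE2 twin below) downsamples two full-resolution
// rows of the planar U and V planes to one half-resolution row and writes the
// result interleaved as U,V,U,V..., i.e. the chroma layout used by NV12/P010
// targets. One caveat worth noting: the SSE2 version consumes 16 input bytes
// per iteration while the loop bound is `u + w`, so it assumes `w` (the dirty
// rect width) is a multiple of 16 and the planes are 16-byte aligned, which
// SetDirtyRectEx arranges by rounding dirty rects out to multiples of 16.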

__forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
        u_1 = _mm_packus_epi16(u_1, u_1);
        v_1 = _mm_packus_epi16(v_1, v_1);
        u_1 = _mm_unpacklo_epi8(u_1, v_1);

        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
}

static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
    __m128i zero = _mm_setzero_si128();

    __m128i ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128 = _mm_unpackhi_epi8(alpha128, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
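
// What the SSE2 block above computes, written out per byte (editorial aid;
// alpha here is "inverse alpha": 0x00 = subtitle fully opaque, 0xFF = keep
// the video pixel):
//     dst = ((dst*alpha + (alpha==0xFF ? 0xFF : 0)) >> 8) + sub
// The extra 0xFF makes the alpha==0xFF case an exact pass-through, since
// (dst*0xFF + 0xFF) >> 8 == dst, which is why the scalar fallbacks can guard
// with `if(sa[0] < 0xff)` and still match this path.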

/**
 * output not exactly identical to pix_alpha_blend_yv12_chroma
 */
static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
    __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

    __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );

    AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);

    __m128i ones = _mm_cmpeq_epi32(ones, ones);
    ones = _mm_cmpeq_epi8(ones, alpha128_1);

    dst128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i dst128_2 = _mm_and_si128(dst128, ones);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, dst128_2);

    dst128 = _mm_srli_epi16(dst128, 8);

    AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);

    dst128 = _mm_adds_epi16(dst128, sub128_1);
    dst128 = _mm_packus_epi16(dst128, dst128);

    _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
}

static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    unsigned int ia = (alpha[0]+alpha[1]+
                       alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
    if( ia!=0xff )
    {
        *dst = (((*dst)*ia)>>8) + ((src[0] +src[1]+
                                    src[src_pitch]+src[1+src_pitch] )>>2);
    }
}
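
// Chroma is blended at quarter resolution: the 2x2 block of alpha values and
// the 2x2 block of subtitle chroma samples are each averaged before the usual
// dst*alpha>>8 + sub formula is applied. The C version above truncates the
// alpha average (sum>>2) while the SSE2 version uses rounded pavgb averages,
// which is presumably what the "output not exactly identical" remark on the
// SSE2 helper refers to.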

static void AlphaBltYv12Luma(byte* dst, int dst_pitch,
    int w, int h,
    const byte* sub, const byte* alpha, int sub_pitch)
{
    if( ((reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch) |
        reinterpret_cast<intptr_t>(dst) | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 )
    {
        for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
        {
            const BYTE* sa = alpha;
            const BYTE* s2 = sub;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
            {
                pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
            }
            for(; s2 < s2end; s2++, sa++, d2++)
            {
                if(sa[0] < 0xff)
                {
                    d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
                }
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
        {
            const BYTE* sa = alpha;
            const BYTE* s2 = sub;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
            {
                if(sa[0] < 0xff)
                {
                    // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
                    d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
                }
            }
        }
    }
}

static void AlphaBltYv12Chroma(byte* dst, int dst_pitch,
    int w, int chroma_h,
    const byte* sub_chroma, const byte* alpha, int sub_pitch)
{
    if( ((reinterpret_cast<intptr_t>(sub_chroma) |
        //reinterpret_cast<intptr_t>(dst) |
        reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch)
        //| (static_cast<intptr_t>(dst_pitch)&7)
        ) & 15 )==0 )
    {
        int pitch = sub_pitch;
        for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
        {
            const BYTE* s2 = sub_chroma;
            const BYTE* sa2 = alpha;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
            {
                pix_alpha_blend_yv12_chroma_sse2(d2, s2, sa2, sub_pitch);
            }
            for(; s2 < s2end; s2+=2, sa2+=2, d2++)
            {
                pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
        {
            const BYTE* s2 = sub_chroma;
            const BYTE* sa2 = alpha;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
            {
                pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
            }
        }
    }
}

__forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}
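
// P010/P016 carry 16-bit luma, so the mixer above cannot reuse the 8-bit
// (dst*alpha)>>8 trick directly. Instead it builds a 16-bit multiplier
// (alpha<<8)+0xFF (the +0x100 variant would overflow, as the inline comment
// says), takes the high 16 bits of the product with _mm_mulhi_epu16, and then
// adds 1 for non-zero dst words when alpha==0xFF, so a fully transparent
// subtitle pixel leaves the video word bit-exact. The subtitle luma is added
// in the high byte (src<<8), matching mix_16_y_p010_c below.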

void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<16;i++)
    {
        if (src_alpha[i]!=0xff)
        {
            dst_word[i] = ((dst_word[i] *src_alpha[i])>>8) + (src[i]<<8);
        }
    }
}

__forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );

    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);

    __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}

void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xff )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}
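
// Arithmetic note for the UV mixers (editorial, follows directly from the
// code): the C helper above first averages the four alphas of a 2x2 block
// with rounding, e.g. alphas 0x20, 0x22, 0x24, 0x26 give
// ((0x20+0x24+1)/2 + (0x22+0x26+1)/2 + 1)/2 = 0x23, and then scales the
// 16-bit chroma word via (dst*ia)>>8. The scalar tail loops inside
// AlphaBltAnv12_P010/Nvxx keep the plain 4-alpha sum instead and shift by 10,
// which is the same scale because (dst*(4*ia))>>10 == (dst*ia)>>8.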

__forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
    __m128i zero = _mm_setzero_si128();

    __m128i ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128_1);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}

void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xff )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}

//
// CMemSubPic
//

CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
    : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
{
    m_maxsize.SetSize(spd.w, spd.h);
    //m_rcDirty.SetRect(0, 0, spd.w, spd.h);
    CRect allSpd(0,0,spd.w, spd.h);
    m_rectListDirty.AddTail(allSpd);
}

CMemSubPic::~CMemSubPic()
{
    delete [] m_spd.bits, m_spd.bits = NULL;
}

STDMETHODIMP_(void*) CMemSubPic::GetObject() const
{
    return (void*)&m_spd;
}

STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
{
    spd.type = m_spd.type;
    spd.w = m_size.cx;
    spd.h = m_size.cy;
    spd.bpp = m_spd.bpp;
    spd.pitch = m_spd.pitch;
    spd.bits = m_spd.bits;
    spd.bitsU = m_spd.bitsU;
    spd.bitsV = m_spd.bitsV;
    spd.vidrect = m_vidrect;
    return S_OK;
}

STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
{
    HRESULT hr;
    if(FAILED(hr = __super::CopyTo(pSubPic))) {
        return hr;
    }

    SubPicDesc src, dst;
    if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
        return E_FAIL;
    }
    while(!m_rectListDirty.IsEmpty())
    {
        CRect& cRect = m_rectListDirty.GetHead();
        int w = cRect.Width(), h = cRect.Height();
        BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
        BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
            memcpy(d, s, w*4);
        m_rectListDirty.RemoveHead();
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
{
    if(m_rectListDirty.IsEmpty()) {
        return S_FALSE;
    }
    while(!m_rectListDirty.IsEmpty())
    {
        //pDirtyRect = m_rectListDirty.RemoveHead();
        CRect& dirtyRect = m_rectListDirty.RemoveTail();
        BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
        int w = dirtyRect.Width();
        if(m_spd.type!=MSP_AYUV_PLANAR)
        {
            for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
            {
                memsetd(p, color, w*4); // nya
            }
        }
        else
        {
            for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
            {
                //        memsetd(p, 0, m_rcDirty.Width());
                //DbgLog((LOG_TRACE, 3, "w:%d", w));
                //w = pDirtyRect->Width();
                memset(p, 0xFF, w);
                memset(p+m_spd.h*m_spd.pitch, 0, w);
                memset(p+m_spd.h*m_spd.pitch*2, 0, w);
                memset(p+m_spd.h*m_spd.pitch*3, 0, w);
            }
        }
    }
    m_rectListDirty.RemoveAll();
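
    // For MSP_AYUV_PLANAR the "clear" above resets the four stacked planes
    // individually: the alpha plane is filled with 0xFF, which in this file's
    // inverse-alpha convention means "show the video unchanged", and the Y, U
    // and V planes are zeroed. Packed formats are simply flooded with the
    // caller-supplied color DWORD.
    return S_OK;
}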

STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
{
    return GetDesc(spd);
}

STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
{
    int src_type = m_spd.type;
    int dst_type = m_alpha_blt_dst_type;
    if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
                                dst_type == MSP_RGB24 ||
                                dst_type == MSP_RGB16 ||
                                dst_type == MSP_RGB15))
        ||
        (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
        ||
        (src_type==MSP_AYUV && dst_type == MSP_AYUV)
        ||
        (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
                                       dst_type == MSP_YV12 ||
                                       dst_type == MSP_P010 ||
                                       dst_type == MSP_P016 ||
                                       dst_type == MSP_NV12 ||
                                       dst_type == MSP_NV21)))
    {
        return UnlockOther(dirtyRectList);
    }
    else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
                                   dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
                                   dst_type == MSP_IYUV ||
                                   dst_type == MSP_YV12 ||
                                   dst_type == MSP_NV12 ||
                                   dst_type == MSP_NV21 ||
                                   dst_type == MSP_P010 ||
                                   dst_type == MSP_P016))
    {
        return UnlockRGBA_YUV(dirtyRectList);
    }
    return E_NOTIMPL;
}

STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
{
    SetDirtyRectEx(dirtyRectList);
    if(m_rectListDirty.IsEmpty()) {
        return S_OK;
    }

    POSITION pos = m_rectListDirty.GetHeadPosition();
    while(pos!=NULL)
    {
        const CRect& cRect = m_rectListDirty.GetNext(pos);
        int w = cRect.Width(), h = cRect.Height();
        BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
        BYTE* bottom = top + m_spd.pitch*h;
        if(m_alpha_blt_dst_type == MSP_RGB16)
        {
            for(; top < bottom ; top += m_spd.pitch)
            {
                DWORD* s = (DWORD*)top;
                DWORD* e = s + w;
                for(; s < e; s++)
                {
                    *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
                    // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_RGB15)
        {
            for(; top < bottom; top += m_spd.pitch)
            {
                DWORD* s = (DWORD*)top;
                DWORD* e = s + w;
                for(; s < e; s++)
                {
                    *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
                    // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_YUY2)
        {
            XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );

            for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
            {
                BYTE* s = tempTop;
                BYTE* e = s + w*4;
                for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
                {
                    s[4] = (s[0] + s[4])>>1;
                    s[0] = (s[2] + s[6])>>1;
                }
            }

            XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
        }
        else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV)
        {
            // nothing to do for planar 4:2:0 targets here
        }
        else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
            || m_alpha_blt_dst_type == MSP_NV12 )
        {
            SubsampleAndInterlace(cRect, true);
        }
        else if( m_alpha_blt_dst_type == MSP_NV21 )
        {
            SubsampleAndInterlace(cRect, false);
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
{
    SetDirtyRectEx(dirtyRectList);
    if(m_rectListDirty.IsEmpty()) {
        return S_OK;
    }

    const ColorConvTable* conv_table = ColorConvTable::GetDefaultColorConvTable();
    const int *c2y_yb = conv_table->c2y_yb;
    const int *c2y_yg = conv_table->c2y_yg;
    const int *c2y_yr = conv_table->c2y_yr;
    const int cy_cy2 = conv_table->cy_cy2;
    const int c2y_cu = conv_table->c2y_cu;
    const int c2y_cv = conv_table->c2y_cv;
    const int cy_cy = conv_table->cy_cy;
    const unsigned char* Clip = conv_table->Clip;

    POSITION pos = m_rectListDirty.GetHeadPosition();
    while(pos!=NULL)
    {
        const CRect& cRect = m_rectListDirty.GetNext(pos);
        int w = cRect.Width(), h = cRect.Height();

        BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
        BYTE* bottom = top + m_spd.pitch*h;

        if( m_alpha_blt_dst_type == MSP_YUY2 ||
            m_alpha_blt_dst_type == MSP_YV12 ||
            m_alpha_blt_dst_type == MSP_IYUV ||
            m_alpha_blt_dst_type == MSP_P010 ||
            m_alpha_blt_dst_type == MSP_P016 ||
            m_alpha_blt_dst_type == MSP_NV12 ||
            m_alpha_blt_dst_type == MSP_NV21 ) {
            for(; top < bottom ; top += m_spd.pitch) {
                BYTE* s = top;
                BYTE* e = s + w*4;
                for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
                    if((s[3]+s[7]) < 0x1fe) {
                        int a = 0x200 - (s[3]+s[7]);
                        a <<= 7; // scale so 0x10*a and 0x80*a carry the 16/128 offsets after >>16

                        s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
                        s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a + 0x8000) >> 16;

                        int scaled_y = (s[1]+s[5]-32) * cy_cy2;

                        s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
                        s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
                    }
                    else {
                        s[1] = s[5] = 0x10;
                        s[0] = s[4] = 0x80;
                    }
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_AYUV) {
            for(; top < bottom ; top += m_spd.pitch) {
                BYTE* s = top;
                BYTE* e = s + w*4;
                for(; s < e; s+=4) { // ARGB -> AYUV
                    if(s[3] < 0xff) {
                        int a = 0x100 - s[3];
                        a <<= 8;

                        int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
                        int scaled_y = (y-32) * cy_cy;
                        s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
                        s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
                        s[2] = y;
                    }
                    else {
                        s[0] = s[1] = 0x80;
                        s[2] = 0x10;
                    }
                }
            }
        }
    }
    return S_OK;
}

void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
{
    //fix me: check alignment and log error
    int w = cRect.Width(), h = cRect.Height();
    BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
    BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
    BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
    BYTE* dst = u_start;
    if(!u_first)
    {
        BYTE* tmp = u_start;
        u_start = v_start;
        v_start = tmp;
    }

    for (int i=0;i<h;i+=2)
    {
        subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w, m_spd.pitch);
        u_start += 2*m_spd.pitch;
        v_start += 2*m_spd.pitch;
        dst += m_spd.pitch;
    }
}

STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    if(!pSrc || !pDst || !pTarget) {
        return E_POINTER;
    }
    int src_type = m_spd.type;
    int dst_type = pTarget->type;

    if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
                                dst_type == MSP_RGB24 ||
                                dst_type == MSP_RGB16 ||
                                dst_type == MSP_RGB15 ||
                                dst_type == MSP_RGBA ||
                                dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
                                dst_type == MSP_AYUV))
        ||
        (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
        ||
        (src_type==MSP_AYUV && dst_type == MSP_AYUV)
        ||
        (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
                                       dst_type == MSP_YV12)) )
    {
        return AlphaBltOther(pSrc, pDst, pTarget);
    }
    else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
                                            dst_type == MSP_NV21) )
    {
        return AlphaBltAnv12_Nvxx(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
                                           dst_type == MSP_P016) )
    {
        return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
                                    dst_type == MSP_YV12))
    {
        return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_NV12 ||
                                    dst_type == MSP_NV21))
    {
        return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
                                    dst_type == MSP_P016))
    {
        return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
    }
    return E_NOTIMPL;
}
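
// Dispatch summary for AlphaBlt (editorial recap of the branches above):
//   MSP_RGBA        -> RGB15/16/24/32, RGBA, YUY2, AYUV : AlphaBltOther
//   MSP_RGBA        -> IYUV/YV12                        : AlphaBltAxyuAxyv_Yv12
//   MSP_RGBA        -> NV12/NV21                        : AlphaBltAxyuAxyv_Nv12
//   MSP_RGBA        -> P010/P016                        : AlphaBltAxyuAxyv_P010
//   MSP_XY_AUYV     -> YUY2                             : AlphaBltOther
//   MSP_AYUV        -> AYUV                             : AlphaBltOther
//   MSP_AYUV_PLANAR -> IYUV/YV12                        : AlphaBltOther
//   MSP_AYUV_PLANAR -> NV12/NV21                        : AlphaBltAnv12_Nvxx
//   MSP_AYUV_PLANAR -> P010/P016                        : AlphaBltAnv12_P010
// Anything else falls through to E_NOTIMPL.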

STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
    if(rd.top > rd.bottom)
    {
        if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
            || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
            || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
        {
            d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
        }
        else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
        {
            d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
        }
        else
        {
            return E_NOTIMPL;
        }
        dst.pitch = -dst.pitch;
    }
    DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
    switch(dst.type)
    {
    case MSP_RGBA:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0xff)
                {
                    DWORD bd =0x00000100 -( (DWORD) s2[3]);
                    DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
                    DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
                    DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
                    *d2 = B | V | R
                        | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
                }
            }
        }
        break;
    case MSP_RGB32:
    case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                DWORD ia = 256-s2[3];
                if(s2[3] < 0xff)
                {
                    *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
                        | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
                    /*
                    *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
                        | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
                    */
                }
            }
        }
        break;
    case MSP_RGB24:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            for(; s2 < s2end; s2 += 4, d2 += 3)
            {
                if(s2[3] < 0xff)
                {
                    d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
                    d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
                    d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
                }
            }
        }
        break;
    case MSP_RGB16:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            WORD* d2 = (WORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0x1f)
                {
                    *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
                        | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
                    /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
                        | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
                        | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
                    */
                }
            }
        }
        break;
    case MSP_RGB15:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            WORD* d2 = (WORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0x1f)
                {
                    *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
                        | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
                    /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
                        | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
                        | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
                    */
                }
            }
        }
        break;
    case MSP_YUY2:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            unsigned int ia, c;
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 8, d2++)
            {
                ia = (s2[3]+s2[7])>>1;
                if(ia < 0xff)
                {
                    //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
                    //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
                    //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
                    //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
                    //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;

                    ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
                    c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
                    __asm
                    {
                        mov         esi, s2
                        mov         edi, d2
                        pxor        mm0, mm0
                        movd        mm2, c
                        punpcklbw   mm2, mm0
                        movd        mm3, [edi]
                        punpcklbw   mm3, mm0
                        movd        mm4, ia
                        punpcklbw   mm4, mm0
                        psraw       mm4, 1 //or else, overflow because psraw shift in sign bit
                        pmullw      mm3, mm4
                        psraw       mm3, 7
                        paddsw      mm3, mm2
                        packuswb    mm3, mm3
                        movd        [edi], mm3
                    };
                }
            }
        }
        break;
    case MSP_YV12:
    case MSP_IYUV:
        {
            //dst.pitch = abs(dst.pitch);
            int h2 = h/2;
            if(!dst.pitchUV)
            {
                dst.pitchUV = abs(dst.pitch)/2;
            }
            if(!dst.bitsU || !dst.bitsV)
            {
                dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
                dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
                if(dst.type == MSP_YV12)
                {
                    BYTE* p = dst.bitsU;
                    dst.bitsU = dst.bitsV;
                    dst.bitsV = p;
                }
            }
            BYTE* dd[2];
            dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
            dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
            if(rd.top > rd.bottom)
            {
                dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
                dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
                dst.pitchUV = -dst.pitchUV;
            }

            BYTE* src_origin = (BYTE*)src.bits + src.pitch*rs.top + rs.left;

            BYTE* ss[2];
            ss[0] = src_origin + src.pitch*src.h*2;//U
            ss[1] = src_origin + src.pitch*src.h*3;//V

            AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );

            AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
            AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
        }
        break;
    default:
        return E_NOTIMPL;
    }

    //emms takes about 40 CPU cycles
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
    {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        WORD* d2 = reinterpret_cast<WORD*>(d);
        for(; s2 < s2end; s2 += 4, d2++)
        {
            if(s2[3] < 0xff)
            {
                d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
            }
        }
    }

    //UV
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    if(!dst.bitsU || !dst.bitsV)
    {
        dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
        dst.bitsV = dst.bitsU + 2;
    }
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
    if(rd.top > rd.bottom)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
        dst.pitchUV = -dst.pitchUV;
    }

    s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    d = ddUV;
    int pitch = src.pitch;
    for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV)
    {
        BYTE* s2 = s;
        WORD* d2 = reinterpret_cast<WORD*>(d);
        WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
        for( ; d2<d2_end; s2+=8, d2+=2)
        {
            unsigned int ia = (s2[3]+s2[3+4]+
                s2[3+src.pitch]+s2[3+4+src.pitch]);
            if( ia!=0xFF*4 )
            {
                d2[0] = (((d2[0])*ia)>>10) + ((s2[0] + s2[0+src.pitch])<<7);
                d2[1] = (((d2[1])*ia)>>10) + ((s2[4] + s2[4+src.pitch])<<7);
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        BYTE* d2 = d;
        for(; s2 < s2end; s2 += 4, d2++) {
            if(s2[3] < 0xff) {
                d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
            }
        }
    }
    dst.pitch = abs(dst.pitch);

    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = dst.pitch/2;
    }

    BYTE* ss[2];
    ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
    ss[1] = ss[0] + 4;

    if(!dst.bitsU || !dst.bitsV) {
        dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
        dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;

        if(dst.type == MSP_YV12) {
            BYTE* p = dst.bitsU;
            dst.bitsU = dst.bitsV;
            dst.bitsV = p;
        }
    }

    BYTE* dd[2];
    dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
    dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;

    if(rd.top > rd.bottom) {
        dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
        dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
        dst.pitchUV = -dst.pitchUV;
    }

    for(ptrdiff_t i = 0; i < 2; i++) {
        s = ss[i];
        d = dd[i];
        BYTE* is = ss[1-i];
        for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            BYTE* is2 = is;
            for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
                unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
                if(ia < 0xff) {
                    *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        BYTE* d2 = d;
        for(; s2 < s2end; s2 += 4, d2++) {
            if(s2[3] < 0xff) {
                d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
            }
        }
    }
    dst.pitch = abs(dst.pitch);

    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = dst.pitch;
    }

    BYTE* ss[2];
    ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
    ss[1] = ss[0] + 4;

    if(!dst.bitsU || !dst.bitsV) {
        dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
        dst.bitsV = dst.bitsU + 1;

        if(dst.type == MSP_NV21) {
            BYTE* p = dst.bitsU;
            dst.bitsU = dst.bitsV;
            dst.bitsV = p;
        }
    }

    BYTE* dd[2];
    dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
    dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left;

    if(rd.top > rd.bottom) {
        dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
        dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left;
        dst.pitchUV = -dst.pitchUV;
    }

    for(ptrdiff_t i = 0; i < 2; i++) {
        s = ss[i];
        d = dd[i];
        BYTE* is = ss[1-i];
        for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            BYTE* is2 = is;
            for(; s2 < s2end; s2 += 8, d2+=2, is2 += 8) {
                unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
                if(ia < 0xff) {
                    *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    //fix me: check colorspace and log error
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    bool bottom_down = rs.top > rd.bottom;

    BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
    if(bottom_down)
    {
        d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
        dst.pitch = -dst.pitch;
    }

    //dst.pitch = abs(dst.pitch);
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
    if(bottom_down)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
        dst.pitchUV = -dst.pitchUV;
    }

    BYTE* src_origin = reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
    BYTE*s = src_origin;

    // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
    //     && (reinterpret_cast<intptr_t>(d2)&15)==0 )
    if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
    {
        for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
        {
            BYTE* sa = s;
            BYTE* s2 = s + src.pitch*src.h;
            BYTE* s2end_mod16 = s2 + (w&~15);
            BYTE* s2end = s2 + w;
            BYTE* d2 = d;

            for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=32)
            {
                mix_16_y_p010_sse2(d2, s2, sa);
            }
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
            {
                if(sa[0] < 0xff)
                {
                    d3[0] = ((d3[0]*sa[0])>>8) + (s2[0]<<8);
                }
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
        {
            BYTE* sa = s;
            BYTE* s2 = s + src.pitch*src.h;
            BYTE* s2end_mod16 = s2 + (w&~15);
            BYTE* s2end = s2 + w;
            WORD* d2 = reinterpret_cast<WORD*>(d);
            for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
            {
                if(sa[0] < 0xff)
                {
                    d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
                }
            }
        }
    }

    d = ddUV;
    BYTE* sa = src_origin;
    BYTE* s_uv = src_origin + src.pitch*src.h*2;//UV
    if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
    {
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;

            for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=32)
            {
                mix_16_uv_p010_sse2(d2, s_u2, sa2, src.pitch);
            }
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
                    d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
                }
            }
        }
    }
    else
    {
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
                    d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAnv12_Nvxx( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    //fix me: check colorspace and log error
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    bool bottom_down = rs.top > rd.bottom;

    BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
    if(bottom_down)
    {
        d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
        dst.pitch = -dst.pitch;
    }

    //dst.pitch = abs(dst.pitch);
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    if(!dst.bitsU)
    {
        dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
    }
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
    if(bottom_down)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
        dst.pitchUV = -dst.pitchUV;
    }

    BYTE* sa = reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;

    BYTE* s_uv = sa + src.pitch*src.h*2;//UV

    AlphaBltYv12Luma( d, dst.pitch, w, h, sa + src.pitch*src.h, sa, src.pitch );
    if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(ddUV) | static_cast<intptr_t>(dst.pitchUV) ) & 15 )==0 )
    {
        d = ddUV;
        int pitch = src.pitch;
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;

            for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=16)
            {
                mix_16_uv_nvxx_sse2(d2, s_u2, sa2, src.pitch);
            }
            for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
                    d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
                }
            }
        }
    }
    else
    {
        d = ddUV;
        int pitch = src.pitch;
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;
            for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
                    d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
{
    //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
    if(dirtyRectList!=NULL)
    {
        POSITION pos = dirtyRectList->GetHeadPosition();
        if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
            || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
            || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21)
        {
            while(pos!=NULL)
            {
                CRect& cRectSrc = dirtyRectList->GetNext(pos);
                cRectSrc.left &= ~15;
                cRectSrc.right = (cRectSrc.right+15)&~15;
                cRectSrc.top &= ~1;
                cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
            }
        }
        else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
        {
            while(pos!=NULL)
            {
                CRect& cRectSrc = dirtyRectList->GetNext(pos);
                cRectSrc.left &= ~3;
                cRectSrc.right = (cRectSrc.right+3)&~3;
            }
        }
    }
    return __super::SetDirtyRectEx(dirtyRectList);
}

//
// CMemSubPicAllocator
//

CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type /*=-1*/)
    : CSubPicExAllocatorImpl(maxsize, false, false)
    , m_alpha_blt_dst_type(alpha_blt_dst_type)
    , m_maxsize(maxsize)
    , m_type(type)
{
    if(m_type==-1)
    {
        switch(alpha_blt_dst_type)
        {
        case MSP_YUY2:
            m_type = MSP_XY_AUYV;
            break;
        case MSP_AYUV:
            m_type = MSP_AYUV;
            break;
        case MSP_IYUV:
        case MSP_YV12:
        case MSP_P010:
        case MSP_P016:
        case MSP_NV12:
        case MSP_NV21:
            m_type = MSP_AYUV_PLANAR;
            break;
        default:
            m_type = MSP_RGBA;
            break;
        }
    }
}

// ISubPicAllocatorImpl

bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
{
    if(!ppSubPic) {
        return false;
    }
    SubPicDesc spd;
    spd.w = m_maxsize.cx;
    spd.h = m_maxsize.cy;
    spd.bpp = 32;
    spd.pitch = (spd.w*spd.bpp)>>3;
    spd.type = m_type;
    spd.bits = DNew BYTE[spd.pitch*spd.h];
    if(!spd.bits) {
        return false;
    }
    *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
    if(!(*ppSubPic)) {
        return false;
    }
    (*ppSubPic)->AddRef();
    return true;
}