2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
23 #include "MemSubPic.h"
24 #include "color_conv_table.h"
// Compute, for two rows of 16 packed 8-bit samples, the rounded average of
// each 2x2 group: first average the two rows per byte (_mm_avg_epu8 rounds
// up), then average each pair of horizontally adjacent bytes. The result is
// left in the LOW byte of every 16-bit lane of m128_1 (high byte zero).
// m128_2 is clobbered (used as scratch).
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);
// Variant of AVERAGE_4_PIX_INTRINSICS that leaves the rounded 2x2 average
// duplicated in BOTH bytes of every 16-bit lane of m128_1 (the byte-swapped
// copy is OR-merged before the final average, so each byte ends up averaged
// with its partner). Needed where the result is consumed as packed bytes,
// e.g. for interleaved U/V data. m128_2 is clobbered.
// Wrapped in a block because it declares a local temporary.
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
42 void subsample_and_interlace_2_line_c(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
)
44 const BYTE
* end
= u
+ w
;
45 for (;u
<end
;dst
+=2,u
+=2,v
+=2)
47 dst
[0] = (u
[0] + u
[0+pitch
] + 1)/2;
48 int tmp1
= (u
[1] + u
[1+pitch
] + 1)/2;
49 dst
[0] = (dst
[0] + tmp1
+ 1)/2;
50 dst
[1] = (v
[0] + v
[0+pitch
] + 1)/2;
51 tmp1
= (v
[1] + v
[1+pitch
] + 1)/2;
52 dst
[1] = (dst
[1] + tmp1
+ 1)/2;
56 __forceinline
void subsample_and_interlace_2_line_sse2(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
)
58 const BYTE
* end
= u
+ w
;
59 for (;u
<end
;dst
+=16,u
+=16,v
+=16)
61 __m128i u_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
) );
62 __m128i u_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
+pitch
) );
63 __m128i v_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
) );
64 __m128i v_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
+pitch
) );
65 AVERAGE_4_PIX_INTRINSICS(u_1
, u_2
);
66 AVERAGE_4_PIX_INTRINSICS(v_1
, v_2
);
67 u_1
= _mm_packus_epi16(u_1
, u_1
);
68 v_1
= _mm_packus_epi16(v_1
, v_1
);
69 u_1
= _mm_unpacklo_epi8(u_1
, v_1
);
71 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), u_1
);
75 static __forceinline
void pix_alpha_blend_yv12_luma_sse2(byte
* dst
, const byte
* alpha
, const byte
* sub
)
77 __m128i dst128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
78 __m128i alpha128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
) );
79 __m128i sub128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(sub
) );
80 __m128i zero
= _mm_setzero_si128();
84 ones
= _mm_setzero_si128();//disable warning C4700
86 ones
= _mm_cmpeq_epi32(ones
,ones
);
87 ones
= _mm_cmpeq_epi8(ones
,alpha128
);
89 __m128i dst_lo128
= _mm_unpacklo_epi8(dst128
, zero
);
90 __m128i alpha_lo128
= _mm_unpacklo_epi8(alpha128
, zero
);
92 __m128i ones2
= _mm_unpacklo_epi8(ones
, zero
);
94 dst_lo128
= _mm_mullo_epi16(dst_lo128
, alpha_lo128
);
95 dst_lo128
= _mm_adds_epu16(dst_lo128
, ones2
);
96 dst_lo128
= _mm_srli_epi16(dst_lo128
, 8);
98 dst128
= _mm_unpackhi_epi8(dst128
, zero
);
99 alpha128
= _mm_unpackhi_epi8(alpha128
, zero
);
101 ones2
= _mm_unpackhi_epi8(ones
, zero
);
103 dst128
= _mm_mullo_epi16(dst128
, alpha128
);
104 dst128
= _mm_adds_epu16(dst128
, ones2
);
105 dst128
= _mm_srli_epi16(dst128
, 8);
106 dst_lo128
= _mm_packus_epi16(dst_lo128
, dst128
);
108 dst_lo128
= _mm_adds_epu8(dst_lo128
, sub128
);
109 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_lo128
);
113 * output not exactly identical to pix_alpha_blend_yv12_chroma
115 static __forceinline
void pix_alpha_blend_yv12_chroma_sse2(byte
* dst
, const byte
* src
, const byte
* alpha
, int src_pitch
)
117 __m128i zero
= _mm_setzero_si128();
118 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
) );
119 __m128i alpha128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
+src_pitch
) );
120 __m128i dst128
= _mm_loadl_epi64( reinterpret_cast<const __m128i
*>(dst
) );
122 __m128i sub128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
123 __m128i sub128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
+src_pitch
) );
125 AVERAGE_4_PIX_INTRINSICS(alpha128_1
, alpha128_2
);
129 ones
= _mm_setzero_si128();//disable warning C4700
131 ones
= _mm_cmpeq_epi32(ones
,ones
);
132 ones
= _mm_cmpeq_epi8(ones
, alpha128_1
);
134 dst128
= _mm_unpacklo_epi8(dst128
, zero
);
135 __m128i dst128_2
= _mm_and_si128(dst128
, ones
);
137 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
138 dst128
= _mm_adds_epu16(dst128
, dst128_2
);
140 dst128
= _mm_srli_epi16(dst128
, 8);
142 AVERAGE_4_PIX_INTRINSICS(sub128_1
, sub128_2
);
144 dst128
= _mm_adds_epi16(dst128
, sub128_1
);
145 dst128
= _mm_packus_epi16(dst128
, dst128
);
147 _mm_storel_epi64( reinterpret_cast<__m128i
*>(dst
), dst128
);
150 static __forceinline
void pix_alpha_blend_yv12_chroma(byte
* dst
, const byte
* src
, const byte
* alpha
, int src_pitch
)
152 unsigned int ia
= (alpha
[0]+alpha
[1]+
153 alpha
[0+src_pitch
]+alpha
[1+src_pitch
])>>2;
156 *dst
= (((*dst
)*ia
)>>8) + ((src
[0] +src
[1]+
157 src
[src_pitch
]+src
[1+src_pitch
] )>>2);
161 static void AlphaBltYv12Luma(byte
* dst
, int dst_pitch
,
163 const byte
* sub
, const byte
* alpha
, int sub_pitch
)
165 if( ((reinterpret_cast<intptr_t>(alpha
) | static_cast<intptr_t>(sub_pitch
) |
166 reinterpret_cast<intptr_t>(dst
) | static_cast<intptr_t>(dst_pitch
) ) & 15 )==0 )
168 for(int i
=0; i
<h
; i
++, dst
+= dst_pitch
, alpha
+= sub_pitch
, sub
+= sub_pitch
)
170 const BYTE
* sa
= alpha
;
171 const BYTE
* s2
= sub
;
172 const BYTE
* s2end_mod16
= s2
+ (w
&~15);
173 const BYTE
* s2end
= s2
+ w
;
176 for(; s2
< s2end_mod16
; s2
+=16, sa
+=16, d2
+=16)
178 pix_alpha_blend_yv12_luma_sse2(d2
, sa
, s2
);
180 for(; s2
< s2end
; s2
++, sa
++, d2
++)
184 d2
[0] = ((d2
[0]*sa
[0])>>8) + s2
[0];
189 else //fix me: only a workaround for non-mod-16 size video
191 for(int i
=0; i
<h
; i
++, dst
+= dst_pitch
, alpha
+= sub_pitch
, sub
+= sub_pitch
)
193 const BYTE
* sa
= alpha
;
194 const BYTE
* s2
= sub
;
195 const BYTE
* s2end_mod16
= s2
+ (w
&~15);
196 const BYTE
* s2end
= s2
+ w
;
198 for(; s2
< s2end
; s2
+=1, sa
+=1, d2
+=1)
202 // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
203 d2
[0] = ((d2
[0]*sa
[0])>>8) + s2
[0];
210 static void AlphaBltYv12Chroma(byte
* dst
, int dst_pitch
,
212 const byte
* sub_chroma
, const byte
* alpha
, int sub_pitch
)
214 if( ((reinterpret_cast<intptr_t>(sub_chroma
) |
215 //reinterpret_cast<intptr_t>(dst) |
216 reinterpret_cast<intptr_t>(alpha
) | static_cast<intptr_t>(sub_pitch
)
217 //| (static_cast<intptr_t>(dst_pitch)&7)
220 int pitch
= sub_pitch
;
221 for(int j
= 0; j
< chroma_h
; j
++, sub_chroma
+= sub_pitch
*2, alpha
+= sub_pitch
*2, dst
+= dst_pitch
)
223 const BYTE
* s2
= sub_chroma
;
224 const BYTE
* sa2
= alpha
;
225 const BYTE
* s2end_mod16
= s2
+ (w
&~15);
226 const BYTE
* s2end
= s2
+ w
;
229 for(; s2
< s2end_mod16
; s2
+= 16, sa2
+= 16, d2
+=8)
231 pix_alpha_blend_yv12_chroma_sse2(d2
, s2
, sa2
, sub_pitch
);
233 for(; s2
< s2end
; s2
+=2, sa2
+=2, d2
++)
235 pix_alpha_blend_yv12_chroma(d2
, s2
, sa2
, sub_pitch
);
239 else//fix me: only a workaround for non-mod-16 size video
241 for(int j
= 0; j
< chroma_h
; j
++, sub_chroma
+= sub_pitch
*2, alpha
+= sub_pitch
*2, dst
+= dst_pitch
)
243 const BYTE
* s2
= sub_chroma
;
244 const BYTE
* sa2
= alpha
;
245 const BYTE
* s2end_mod16
= s2
+ (w
&~15);
246 const BYTE
* s2end
= s2
+ w
;
248 for(; s2
< s2end
; s2
+= 2, sa2
+= 2, d2
++)
250 pix_alpha_blend_yv12_chroma(d2
, s2
, sa2
, sub_pitch
);
256 __forceinline
void mix_16_y_p010_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
)
259 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
260 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
261 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
265 alpha_ff
= _mm_setzero_si128();//disable warning C4700
267 alpha_ff
= _mm_cmpeq_epi32(alpha_ff
,alpha_ff
);
269 alpha_ff
= _mm_cmpeq_epi8(alpha_ff
, alpha
);
271 __m128i lo
= _mm_unpacklo_epi8(alpha_ff
, alpha
);//(alpha<<8)+0x100 will overflow
272 //so we do it another way
273 //first, (alpha<<8)+0xff
274 __m128i ones
= _mm_setzero_si128();
275 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
279 ones2
= _mm_setzero_si128();//disable warning C4700
281 ones2
= _mm_cmpeq_epi32(ones2
,ones2
);
283 ones
= _mm_xor_si128(ones
, ones2
);
284 ones
= _mm_srli_epi16(ones
, 15);
285 ones
= _mm_and_si128(ones
, lo
);
287 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
288 dst_y
= _mm_adds_epu16(dst_y
, ones
);//then add one if necessary
290 lo
= _mm_setzero_si128();
291 lo
= _mm_unpacklo_epi8(lo
, src_y
);
292 dst_y
= _mm_adds_epu16(dst_y
, lo
);
293 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
296 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
298 lo
= _mm_unpackhi_epi8(alpha_ff
, alpha
);
300 ones
= _mm_setzero_si128();
301 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
302 ones
= _mm_xor_si128(ones
, ones2
);
303 ones
= _mm_srli_epi16(ones
, 15);
304 ones
= _mm_and_si128(ones
, lo
);
306 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
307 dst_y
= _mm_adds_epu16(dst_y
, ones
);
309 lo
= _mm_setzero_si128();
310 lo
= _mm_unpackhi_epi8(lo
, src_y
);
311 dst_y
= _mm_adds_epu16(dst_y
, lo
);
312 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
316 void mix_16_y_p010_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
)
318 WORD
* dst_word
= reinterpret_cast<WORD
*>(dst
);
319 for (int i
=0;i
<16;i
++)
321 if (src_alpha
[i
]!=0xff)
323 dst_word
[i
] = ((dst_word
[i
] *src_alpha
[i
])>>8) + (src
[i
]<<8);
328 __forceinline
void mix_16_uv_p010_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
331 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
332 __m128i alpha2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
+pitch
) );
334 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
335 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
337 AVERAGE_4_PIX_INTRINSICS_2(alpha
, alpha2
);
341 alpha_ff
= _mm_setzero_si128();//disable warning C4700
343 alpha_ff
= _mm_cmpeq_epi32(alpha_ff
,alpha_ff
);
345 alpha_ff
= _mm_cmpeq_epi8(alpha_ff
, alpha
);
347 __m128i lo
= _mm_unpacklo_epi8(alpha_ff
, alpha
);//(alpha<<8)+0x100 will overflow
348 //so we do it another way
349 //first, (alpha<<8)+0xff
350 __m128i ones
= _mm_setzero_si128();
351 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
355 ones2
= _mm_setzero_si128();//disable warning C4700
357 ones2
= _mm_cmpeq_epi32(ones2
,ones2
);
358 ones
= _mm_xor_si128(ones
, ones2
);
359 ones
= _mm_srli_epi16(ones
, 15);
360 ones
= _mm_and_si128(ones
, lo
);
362 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
363 dst_y
= _mm_adds_epu16(dst_y
, ones
);//then add one if necessary
365 lo
= _mm_setzero_si128();
366 lo
= _mm_unpacklo_epi8(lo
, src_y
);
367 dst_y
= _mm_adds_epu16(dst_y
, lo
);
368 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
371 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
373 lo
= _mm_unpackhi_epi8(alpha_ff
, alpha
);
375 ones
= _mm_setzero_si128();
376 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
377 ones
= _mm_xor_si128(ones
, ones2
);
378 ones
= _mm_srli_epi16(ones
, 15);
379 ones
= _mm_and_si128(ones
, lo
);
381 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
382 dst_y
= _mm_adds_epu16(dst_y
, ones
);
384 lo
= _mm_setzero_si128();
385 lo
= _mm_unpackhi_epi8(lo
, src_y
);
386 dst_y
= _mm_adds_epu16(dst_y
, lo
);
387 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
391 void mix_16_uv_p010_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
393 WORD
* dst_word
= reinterpret_cast<WORD
*>(dst
);
394 for (int i
=0;i
<8;i
++, src_alpha
+=2, src
+=2, dst_word
+=2)
397 (src_alpha
[0]+src_alpha
[0+pitch
]+1)/2+
398 (src_alpha
[1]+src_alpha
[1+pitch
]+1)/2+1)/2;
401 int tmp
= (((dst_word
[0])*ia
)>>8) + (src
[0]<<8);
402 if(tmp
>0xffff) tmp
= 0xffff;
404 tmp
= (((dst_word
[1])*ia
)>>8) + (src
[1]<<8);
405 if(tmp
>0xffff) tmp
= 0xffff;
411 __forceinline
void mix_16_uv_nvxx_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
413 __m128i dst128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
414 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
415 __m128i alpha128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
+pitch
) );
416 __m128i sub128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
418 AVERAGE_4_PIX_INTRINSICS_2(alpha128_1
, alpha128_2
);
419 __m128i zero
= _mm_setzero_si128();
423 ones
= _mm_setzero_si128();//disable warning C4700
425 ones
= _mm_cmpeq_epi32(ones
,ones
);
426 ones
= _mm_cmpeq_epi8(ones
,alpha128_1
);
428 __m128i dst_lo128
= _mm_unpacklo_epi8(dst128
, zero
);
429 alpha128_2
= _mm_unpacklo_epi8(alpha128_1
, zero
);
431 __m128i ones2
= _mm_unpacklo_epi8(ones
, zero
);
433 dst_lo128
= _mm_mullo_epi16(dst_lo128
, alpha128_2
);
434 dst_lo128
= _mm_adds_epu16(dst_lo128
, ones2
);
435 dst_lo128
= _mm_srli_epi16(dst_lo128
, 8);
437 dst128
= _mm_unpackhi_epi8(dst128
, zero
);
438 alpha128_1
= _mm_unpackhi_epi8(alpha128_1
, zero
);
440 ones2
= _mm_unpackhi_epi8(ones
, zero
);
442 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
443 dst128
= _mm_adds_epu16(dst128
, ones2
);
444 dst128
= _mm_srli_epi16(dst128
, 8);
445 dst_lo128
= _mm_packus_epi16(dst_lo128
, dst128
);
447 dst_lo128
= _mm_adds_epu8(dst_lo128
, sub128
);
448 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_lo128
);
452 void mix_16_uv_nvxx_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
454 for (int i
=0;i
<8;i
++, src_alpha
+=2, src
+=2, dst
+=2)
457 (src_alpha
[0]+src_alpha
[0+pitch
]+1)/2+
458 (src_alpha
[1]+src_alpha
[1+pitch
]+1)/2+1)/2;
461 dst
[0] = (((dst
[0])*ia
)>>8) + src
[0];
462 dst
[1] = (((dst
[1])*ia
)>>8) + src
[1];
471 CMemSubPic::CMemSubPic(SubPicDesc
& spd
, int alpha_blt_dst_type
)
472 : m_spd(spd
), m_alpha_blt_dst_type(alpha_blt_dst_type
)
474 m_maxsize
.SetSize(spd
.w
, spd
.h
);
475 // m_rcDirty.SetRect(0, 0, spd.w, spd.h);
476 CRect
allSpd(0,0,spd
.w
, spd
.h
);
477 m_rectListDirty
.AddTail(allSpd
);
480 CMemSubPic::~CMemSubPic()
482 delete [] m_spd
.bits
, m_spd
.bits
= NULL
;
487 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
489 return (void*)&m_spd
;
492 STDMETHODIMP
CMemSubPic::GetDesc(SubPicDesc
& spd
) const
494 spd
.type
= m_spd
.type
;
498 spd
.pitch
= m_spd
.pitch
;
499 spd
.bits
= m_spd
.bits
;
500 spd
.bitsU
= m_spd
.bitsU
;
501 spd
.bitsV
= m_spd
.bitsV
;
502 spd
.vidrect
= m_vidrect
;
506 STDMETHODIMP
CMemSubPic::CopyTo(ISubPicEx
* pSubPic
)
509 if(FAILED(hr
= __super::CopyTo(pSubPic
))) {
514 if(FAILED(GetDesc(src
)) || FAILED(pSubPic
->GetDesc(dst
))) {
517 while(!m_rectListDirty
.IsEmpty())
519 CRect
& cRect
= m_rectListDirty
.GetHead();
520 int w
= cRect
.Width(), h
= cRect
.Height();
521 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*cRect
.top
+ cRect
.left
*4;
522 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*cRect
.top
+ cRect
.left
*4;
523 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
529 STDMETHODIMP
CMemSubPic::ClearDirtyRect(DWORD color
)
531 if(m_rectListDirty
.IsEmpty()) {
534 while(!m_rectListDirty
.IsEmpty())
536 //pDirtyRect = m_rectListDirty.RemoveHead();
537 CRect
& dirtyRect
= m_rectListDirty
.RemoveTail();
538 BYTE
* p
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*(dirtyRect
.top
) + dirtyRect
.left
*(m_spd
.bpp
>>3);
539 int w
= dirtyRect
.Width();
540 if(m_spd
.type
!=MSP_AYUV_PLANAR
)
542 for(int j
= 0, h
= dirtyRect
.Height(); j
< h
; j
++, p
+= m_spd
.pitch
)
545 memsetd(p
, color
, w
*4); // nya
563 for(int j
= 0, h
= dirtyRect
.Height(); j
< h
; j
++, p
+= m_spd
.pitch
)
565 // memsetd(p, 0, m_rcDirty.Width());
566 //DbgLog((LOG_TRACE, 3, "w:%d", w));
567 //w = pDirtyRect->Width();
569 memset(p
+m_spd
.h
*m_spd
.pitch
, 0, w
);
570 memset(p
+m_spd
.h
*m_spd
.pitch
*2, 0, w
);
571 memset(p
+m_spd
.h
*m_spd
.pitch
*3, 0, w
);
575 m_rectListDirty
.RemoveAll();
579 STDMETHODIMP
CMemSubPic::Lock(SubPicDesc
& spd
)
584 STDMETHODIMP
CMemSubPic::Unlock( CAtlList
<CRect
>* dirtyRectList
)
586 int src_type
= m_spd
.type
;
587 int dst_type
= m_alpha_blt_dst_type
;
588 if( (src_type
==MSP_RGBA
&& (dst_type
== MSP_RGB32
||
589 dst_type
== MSP_RGB24
||
590 dst_type
== MSP_RGB16
||
591 dst_type
== MSP_RGB15
))
593 (src_type
==MSP_XY_AUYV
&& dst_type
== MSP_YUY2
)//ToDo: fix me MSP_AYUV
595 (src_type
==MSP_AYUV
&& dst_type
== MSP_AYUV
)
597 (src_type
==MSP_AYUV_PLANAR
&& (dst_type
== MSP_IYUV
||
598 dst_type
== MSP_YV12
||
599 dst_type
== MSP_P010
||
600 dst_type
== MSP_P016
||
601 dst_type
== MSP_NV12
||
602 dst_type
== MSP_NV21
)))
604 return UnlockOther(dirtyRectList
);
606 else if(src_type
==MSP_RGBA
&& (dst_type
== MSP_YUY2
||
607 dst_type
== MSP_AYUV
|| //ToDo: fix me MSP_AYUV
608 dst_type
== MSP_IYUV
||
609 dst_type
== MSP_YV12
||
610 dst_type
== MSP_NV12
||
611 dst_type
== MSP_NV21
||
612 dst_type
== MSP_P010
||
613 dst_type
== MSP_P016
))
615 return UnlockRGBA_YUV(dirtyRectList
);
620 STDMETHODIMP
CMemSubPic::UnlockOther(CAtlList
<CRect
>* dirtyRectList
)
622 SetDirtyRectEx(dirtyRectList
);
623 if(m_rectListDirty
.IsEmpty()) {
627 POSITION pos
= m_rectListDirty
.GetHeadPosition();
630 const CRect
& cRect
= m_rectListDirty
.GetNext(pos
);
631 int w
= cRect
.Width(), h
= cRect
.Height();
632 BYTE
* top
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*(cRect
.top
) + cRect
.left
*4;
633 BYTE
* bottom
= top
+ m_spd
.pitch
*h
;
634 if(m_alpha_blt_dst_type
== MSP_RGB16
)
636 for(; top
< bottom
; top
+= m_spd
.pitch
)
638 DWORD
* s
= (DWORD
*)top
;
642 *s
= ((*s
>>3)&0x1f000000)|((*s
>>8)&0xf800)|((*s
>>5)&0x07e0)|((*s
>>3)&0x001f);
643 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
647 else if(m_alpha_blt_dst_type
== MSP_RGB15
)
649 for(; top
< bottom
; top
+= m_spd
.pitch
)
651 DWORD
* s
= (DWORD
*)top
;
655 *s
= ((*s
>>3)&0x1f000000)|((*s
>>9)&0x7c00)|((*s
>>6)&0x03e0)|((*s
>>3)&0x001f);
656 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
660 else if(m_alpha_blt_dst_type
== MSP_YUY2
)
662 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top
, m_spd
.pitch
*(h
-1)) );
664 for(BYTE
* tempTop
=top
; tempTop
< bottom
; tempTop
+= m_spd
.pitch
)
668 for(; s
< e
; s
+=8) // AUYV AUYV -> AxYU AxYV
670 s
[4] = (s
[0] + s
[4])>>1;
671 s
[0] = (s
[2] + s
[6])>>1;
675 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top
, m_spd
.pitch
*(h
-1)) );
677 else if(m_alpha_blt_dst_type
== MSP_YV12
|| m_alpha_blt_dst_type
== MSP_IYUV
)
681 else if ( m_alpha_blt_dst_type
== MSP_P010
|| m_alpha_blt_dst_type
== MSP_P016
682 || m_alpha_blt_dst_type
== MSP_NV12
)
684 SubsampleAndInterlace(cRect
, true);
686 else if( m_alpha_blt_dst_type
== MSP_NV21
)
688 SubsampleAndInterlace(cRect
, false);
694 STDMETHODIMP
CMemSubPic::UnlockRGBA_YUV(CAtlList
<CRect
>* dirtyRectList
)
696 SetDirtyRectEx(dirtyRectList
);
697 if(m_rectListDirty
.IsEmpty()) {
701 const ColorConvTable
*conv_table
= ColorConvTable::GetDefaultColorConvTable();
702 const int *c2y_yb
= conv_table
->c2y_yb
;
703 const int *c2y_yg
= conv_table
->c2y_yg
;
704 const int *c2y_yr
= conv_table
->c2y_yr
;
705 const int cy_cy2
= conv_table
->cy_cy2
;
706 const int c2y_cu
= conv_table
->c2y_cu
;
707 const int c2y_cv
= conv_table
->c2y_cv
;
708 const int cy_cy
= conv_table
->cy_cy
;
709 const unsigned char* Clip
= conv_table
->Clip
;
711 POSITION pos
= m_rectListDirty
.GetHeadPosition();
714 const CRect
& cRect
= m_rectListDirty
.GetNext(pos
);
715 int w
= cRect
.Width(), h
= cRect
.Height();
717 BYTE
* top
= (BYTE
*)m_spd
.bits
+ m_spd
.pitch
*cRect
.top
+ cRect
.left
*4;
718 BYTE
* bottom
= top
+ m_spd
.pitch
*h
;
720 if( m_alpha_blt_dst_type
== MSP_YUY2
||
721 m_alpha_blt_dst_type
== MSP_YV12
||
722 m_alpha_blt_dst_type
== MSP_IYUV
||
723 m_alpha_blt_dst_type
== MSP_P010
||
724 m_alpha_blt_dst_type
== MSP_P016
||
725 m_alpha_blt_dst_type
== MSP_NV12
||
726 m_alpha_blt_dst_type
== MSP_NV21
) {
727 for(; top
< bottom
; top
+= m_spd
.pitch
) {
730 for(; s
< e
; s
+=8) { // ARGB ARGB -> AxYU AxYV
731 if((s
[3]+s
[7]) < 0x1fe) {
732 int a
= 0x200 - (s
[3]+s
[7]);
735 s
[1] = (c2y_yb
[s
[0]] + c2y_yg
[s
[1]] + c2y_yr
[s
[2]] + 0x10*a
+ 0x8000) >> 16;
736 s
[5] = (c2y_yb
[s
[4]] + c2y_yg
[s
[5]] + c2y_yr
[s
[6]] + 0x10*a
+ 0x8000) >> 16;
738 int scaled_y
= (s
[1]+s
[5]-32) * cy_cy2
;
740 s
[0] = Clip
[(((((s
[0]+s
[4])<<15) - scaled_y
) >> 10) * c2y_cu
+ 0x80*a
+ 0x8000) >> 16];
741 s
[4] = Clip
[(((((s
[2]+s
[6])<<15) - scaled_y
) >> 10) * c2y_cv
+ 0x80*a
+ 0x8000) >> 16];
749 else if(m_alpha_blt_dst_type
== MSP_AYUV
) {
750 for(; top
< bottom
; top
+= m_spd
.pitch
) {
753 for(; s
< e
; s
+=4) { // ARGB -> AYUV
755 int a
= 0x100 - s
[3];
759 int y
= (c2y_yb
[s
[0]] + c2y_yg
[s
[1]] + c2y_yr
[s
[2]] + 0x10*a
+ 0x8000) >> 16;
760 int scaled_y
= (y
-32) * cy_cy
;
761 s
[1] = Clip
[((((s
[0]<<16) - scaled_y
) >> 10) * c2y_cu
+ 0x80*a
+ 0x8000) >> 16];
762 s
[0] = Clip
[((((s
[2]<<16) - scaled_y
) >> 10) * c2y_cv
+ 0x80*a
+ 0x8000) >> 16];
775 void CMemSubPic::SubsampleAndInterlace( const CRect
& cRect
, bool u_first
)
777 //fix me: check alignment and log error
778 int w
= cRect
.Width(), h
= cRect
.Height();
779 BYTE
* u_plan
= reinterpret_cast<BYTE
*>(m_spd
.bits
) + m_spd
.pitch
*m_spd
.h
*2;
780 BYTE
* u_start
= u_plan
+ m_spd
.pitch
*(cRect
.top
)+ cRect
.left
;
781 BYTE
* v_start
= u_start
+ m_spd
.pitch
*m_spd
.h
;
791 //Walkarround for alignment
792 if ( (m_spd
.pitch
&15) == 0 )
794 for (int i
=0;i
<h
;i
+=2)
796 subsample_and_interlace_2_line_sse2(dst
, u_start
, v_start
, w
, m_spd
.pitch
);
797 u_start
+= 2*m_spd
.pitch
;
798 v_start
+= 2*m_spd
.pitch
;
804 for (int i
=0;i
<h
;i
+=2)
806 subsample_and_interlace_2_line_c(dst
, u_start
, v_start
, w
, m_spd
.pitch
);
807 u_start
+= 2*m_spd
.pitch
;
808 v_start
+= 2*m_spd
.pitch
;
814 STDMETHODIMP
CMemSubPic::AlphaBlt( const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
816 if(!pSrc
|| !pDst
|| !pTarget
) {
819 int src_type
= m_spd
.type
;
820 int dst_type
= pTarget
->type
;
822 if( (src_type
==MSP_RGBA
&& (dst_type
== MSP_RGB32
||
823 dst_type
== MSP_RGB24
||
824 dst_type
== MSP_RGB16
||
825 dst_type
== MSP_RGB15
||
826 dst_type
== MSP_RGBA
||
827 dst_type
== MSP_YUY2
||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
828 dst_type
== MSP_AYUV
))
830 (src_type
==MSP_XY_AUYV
&& dst_type
== MSP_YUY2
)//ToDo: fix me MSP_AYUV
832 (src_type
==MSP_AYUV
&& dst_type
== MSP_AYUV
)
834 (src_type
==MSP_AYUV_PLANAR
&& (dst_type
== MSP_IYUV
||
835 dst_type
== MSP_YV12
)) )
837 return AlphaBltOther(pSrc
, pDst
, pTarget
);
839 else if ( src_type
==MSP_AYUV_PLANAR
&& (dst_type
== MSP_NV12
||
840 dst_type
== MSP_NV21
) )
842 return AlphaBltAnv12_Nvxx(pSrc
, pDst
, pTarget
);
845 else if( src_type
==MSP_AYUV_PLANAR
&& (dst_type
== MSP_P010
||
846 dst_type
== MSP_P016
) )
848 return AlphaBltAnv12_P010(pSrc
, pDst
, pTarget
);
850 else if( src_type
==MSP_RGBA
&& (dst_type
== MSP_IYUV
||
851 dst_type
== MSP_YV12
))
853 return AlphaBltAxyuAxyv_Yv12(pSrc
, pDst
, pTarget
);
855 else if( src_type
==MSP_RGBA
&& (dst_type
== MSP_NV12
||
856 dst_type
== MSP_NV21
))
858 return AlphaBltAxyuAxyv_Nv12(pSrc
, pDst
, pTarget
);
860 else if( src_type
==MSP_RGBA
&& (dst_type
== MSP_P010
||
861 dst_type
== MSP_P016
))
863 return AlphaBltAxyuAxyv_P010(pSrc
, pDst
, pTarget
);
868 STDMETHODIMP
CMemSubPic::AlphaBltOther(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
870 const SubPicDesc
& src
= m_spd
;
871 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
873 CRect
rs(*pSrc
), rd(*pDst
);
877 rd
.bottom
= dst
.h
- rd
.bottom
;
878 rd
.top
= dst
.h
- rd
.top
;
880 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
883 int w
= rs
.Width(), h
= rs
.Height();
884 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);//rs.left*4
885 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*rd
.top
+ ((rd
.left
*dst
.bpp
)>>3);
886 if(rd
.top
> rd
.bottom
)
888 if(dst
.type
== MSP_RGB32
|| dst
.type
== MSP_RGB24
889 || dst
.type
== MSP_RGB16
|| dst
.type
== MSP_RGB15
890 || dst
.type
== MSP_YUY2
|| dst
.type
== MSP_AYUV
)
892 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + (rd
.left
*dst
.bpp
>>3);
894 else if(dst
.type
== MSP_YV12
|| dst
.type
== MSP_IYUV
)
896 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + (rd
.left
*8>>3);
902 dst
.pitch
= -dst
.pitch
;
904 DbgLog((LOG_TRACE
, 5, TEXT("w=%d h=%d"), w
, h
));
908 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
911 BYTE
* s2end
= s2
+ w
*4;
912 DWORD
* d2
= (DWORD
*)d
;
913 for(; s2
< s2end
; s2
+= 4, d2
++)
917 DWORD bd
=0x00000100 -( (DWORD
) s2
[3]);
918 DWORD B
= ((*((DWORD
*)s2
)&0x000000ff)<<8)/bd
;
919 DWORD V
= ((*((DWORD
*)s2
)&0x0000ff00)/bd
)<<8;
920 DWORD R
= (((*((DWORD
*)s2
)&0x00ff0000)>>8)/bd
)<<16;
922 | (0xff000000-(*((DWORD
*)s2
)&0xff000000))&0xff000000;
928 case MSP_AYUV
: //ToDo: fix me MSP_VUYA indeed?
929 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
932 BYTE
* s2end
= s2
+ w
*4;
933 DWORD
* d2
= (DWORD
*)d
;
934 for(; s2
< s2end
; s2
+= 4, d2
++)
937 DWORD ia
= 256-s2
[3];
939 *d2
= ((((*d2
&0x00ff00ff)*s2
[3])>>8) + (((*((DWORD
*)s2
)&0x00ff00ff)*ia
)>>8)&0x00ff00ff)
940 | ((((*d2
&0x0000ff00)*s2
[3])>>8) + (((*((DWORD
*)s2
)&0x0000ff00)*ia
)>>8)&0x0000ff00);
945 *d2
= (((((*d2
&0x00ff00ff)*s2
[3])>>8) + (*((DWORD
*)s2
)&0x00ff00ff))&0x00ff00ff)
946 | (((((*d2
&0x0000ff00)*s2
[3])>>8) + (*((DWORD
*)s2
)&0x0000ff00))&0x0000ff00);
953 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
956 BYTE
* s2end
= s2
+ w
*4;
958 for(; s2
< s2end
; s2
+= 4, d2
+= 3)
962 d2
[0] = ((d2
[0]*s2
[3])>>8) + s2
[0];
963 d2
[1] = ((d2
[1]*s2
[3])>>8) + s2
[1];
964 d2
[2] = ((d2
[2]*s2
[3])>>8) + s2
[2];
970 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
973 BYTE
* s2end
= s2
+ w
*4;
975 for(; s2
< s2end
; s2
+= 4, d2
++)
979 *d2
= (WORD
)((((((*d2
&0xf81f)*s2
[3])>>5) + (*(DWORD
*)s2
&0xf81f))&0xf81f)
980 | (((((*d2
&0x07e0)*s2
[3])>>5) + (*(DWORD
*)s2
&0x07e0))&0x07e0));
981 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
982 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
983 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
990 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
993 BYTE
* s2end
= s2
+ w
*4;
995 for(; s2
< s2end
; s2
+= 4, d2
++)
999 *d2
= (WORD
)((((((*d2
&0x7c1f)*s2
[3])>>5) + (*(DWORD
*)s2
&0x7c1f))&0x7c1f)
1000 | (((((*d2
&0x03e0)*s2
[3])>>5) + (*(DWORD
*)s2
&0x03e0))&0x03e0));
1001 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
1002 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
1003 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
1010 for(int j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
)
1014 BYTE
* s2end
= s2
+ w
*4;
1015 DWORD
* d2
= (DWORD
*)d
;
1016 for(; s2
< s2end
; s2
+= 8, d2
++)
1018 ia
= (s2
[3]+s2
[7])>>1;
1021 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
1022 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
1023 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
1024 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
1025 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
1027 ia
= (ia
<<24)|(s2
[7]<<16)|(ia
<<8)|s2
[3];
1028 c
= (s2
[4]<<24)|(s2
[5]<<16)|(s2
[0]<<8)|s2
[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
1039 psraw mm4
, 1 //or else, overflow because psraw shift in sign bit
1054 //dst.pitch = abs(dst.pitch);
1058 dst
.pitchUV
= abs(dst
.pitch
)/2;
1060 if(!dst
.bitsU
|| !dst
.bitsV
)
1062 dst
.bitsU
= (BYTE
*)dst
.bits
+ abs(dst
.pitch
)*dst
.h
;
1063 dst
.bitsV
= dst
.bitsU
+ dst
.pitchUV
*dst
.h
/2;
1064 if(dst
.type
== MSP_YV12
)
1066 BYTE
* p
= dst
.bitsU
;
1067 dst
.bitsU
= dst
.bitsV
;
1072 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1073 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1074 if(rd
.top
> rd
.bottom
)
1076 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1077 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1078 dst
.pitchUV
= -dst
.pitchUV
;
1081 BYTE
* src_origin
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
;
1084 ss
[0] = src_origin
+ src
.pitch
*src
.h
*2;//U
1085 ss
[1] = src_origin
+ src
.pitch
*src
.h
*3;//V
1087 AlphaBltYv12Luma( d
, dst
.pitch
, w
, h
, src_origin
+ src
.pitch
*src
.h
, src_origin
, src
.pitch
);
1089 AlphaBltYv12Chroma( dd
[0], dst
.pitchUV
, w
, h2
, ss
[0], src_origin
, src
.pitch
);
1090 AlphaBltYv12Chroma( dd
[1], dst
.pitchUV
, w
, h2
, ss
[1], src_origin
, src
.pitch
);
1100 //emmsÒª40¸öcpuÖÜÆÚ
1105 STDMETHODIMP
CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
1107 const SubPicDesc
& src
= m_spd
;
1108 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
1110 CRect
rs(*pSrc
), rd(*pDst
);
1114 rd
.bottom
= dst
.h
- rd
.bottom
;
1115 rd
.top
= dst
.h
- rd
.top
;
1118 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1119 return E_INVALIDARG
;
1122 int w
= rs
.Width(), h
= rs
.Height();
1125 BYTE
* s
= static_cast<BYTE
*>(src
.bits
) + src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);
1126 BYTE
* d
= static_cast<BYTE
*>(dst
.bits
) + dst
.pitch
*rd
.top
+ rd
.left
*2;
1128 if(rd
.top
> rd
.bottom
) {
1129 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + rd
.left
;
1131 dst
.pitch
= -dst
.pitch
;
1134 for(ptrdiff_t i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
1137 BYTE
* s2end
= s2
+ w
*4;
1138 WORD
* d2
= reinterpret_cast<WORD
*>(d
);
1139 for(; s2
< s2end
; s2
+= 4, d2
++)
1142 d2
[0] = ((d2
[0]*s2
[3])>>8) + (s2
[1]<<8);
1151 dst
.pitchUV
= abs(dst
.pitch
);
1153 if(!dst
.bitsU
|| !dst
.bitsV
)
1155 dst
.bitsU
= static_cast<BYTE
*>(dst
.bits
) + abs(dst
.pitch
)*dst
.h
;
1156 dst
.bitsV
= dst
.bitsU
+ 2;
1158 BYTE
* ddUV
= dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
*2;
1159 if(rd
.top
> rd
.bottom
)
1161 ddUV
= dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
*2;
1162 dst
.pitchUV
= -dst
.pitchUV
;
1165 s
= static_cast<BYTE
*>(src
.bits
) + src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);
1168 int pitch
= src
.pitch
;
1169 for(int j
= 0; j
< h2
; j
++, s
+= 2*src
.pitch
, d
+= dst
.pitchUV
)
1172 WORD
* d2
=reinterpret_cast<WORD
*>(d
);
1173 WORD
* d2_end
= reinterpret_cast<WORD
*>(d
+2*w
);
1174 for( ; d2
<d2_end
; s2
+=8, d2
+=2)
1178 s2
[3+src
.pitch
]+s2
[3+4+src
.pitch
]);
1181 d2
[0] = (((d2
[0])*ia
)>>10) + ((s2
[0] + s2
[0+src
.pitch
])<<7);
1182 d2
[1] = (((d2
[1])*ia
)>>10) + ((s2
[4] + s2
[4+src
.pitch
])<<7);
1190 STDMETHODIMP
CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
1192 const SubPicDesc
& src
= m_spd
;
1193 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
1195 CRect
rs(*pSrc
), rd(*pDst
);
1199 rd
.bottom
= dst
.h
- rd
.bottom
;
1200 rd
.top
= dst
.h
- rd
.top
;
1203 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1204 return E_INVALIDARG
;
1207 int w
= rs
.Width(), h
= rs
.Height();
1209 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);
1210 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*rd
.top
+ rd
.left
;
1212 if(rd
.top
> rd
.bottom
) {
1213 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + rd
.left
;
1215 dst
.pitch
= -dst
.pitch
;
1218 for(ptrdiff_t j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
) {
1220 BYTE
* s2end
= s2
+ w
*4;
1222 for(; s2
< s2end
; s2
+= 4, d2
++) {
1224 d2
[0] = ((d2
[0]*s2
[3])>>8) + s2
[1];
1228 dst
.pitch
= abs(dst
.pitch
);
1233 dst
.pitchUV
= dst
.pitch
/2;
1237 ss
[0] = (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
*4;
1240 if(!dst
.bitsU
|| !dst
.bitsV
) {
1241 dst
.bitsU
= (BYTE
*)dst
.bits
+ dst
.pitch
*dst
.h
;
1242 dst
.bitsV
= dst
.bitsU
+ dst
.pitchUV
*dst
.h
/2;
1244 if(dst
.type
== MSP_YV12
) {
1245 BYTE
* p
= dst
.bitsU
;
1246 dst
.bitsU
= dst
.bitsV
;
1252 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1253 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
/2;
1255 if(rd
.top
> rd
.bottom
) {
1256 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1257 dd
[1] = dst
.bitsV
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
/2;
1258 dst
.pitchUV
= -dst
.pitchUV
;
1261 for(ptrdiff_t i
= 0; i
< 2; i
++) {
1265 for(ptrdiff_t j
= 0; j
< h2
; j
++, s
+= src
.pitch
*2, d
+= dst
.pitchUV
, is
+= src
.pitch
*2) {
1267 BYTE
* s2end
= s2
+ w
*4;
1270 for(; s2
< s2end
; s2
+= 8, d2
++, is2
+= 8) {
1271 unsigned int ia
= (s2
[3]+s2
[3+src
.pitch
]+is2
[3]+is2
[3+src
.pitch
])>>2;
1273 *d2
= ((*d2
*ia
)>>8) + ((s2
[0]+s2
[src
.pitch
])>>1);
1282 STDMETHODIMP
CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
1284 const SubPicDesc
& src
= m_spd
;
1285 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
1287 CRect
rs(*pSrc
), rd(*pDst
);
1291 rd
.bottom
= dst
.h
- rd
.bottom
;
1292 rd
.top
= dst
.h
- rd
.top
;
1295 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1296 return E_INVALIDARG
;
1299 int w
= rs
.Width(), h
= rs
.Height();
1301 BYTE
* s
= (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ ((rs
.left
*src
.bpp
)>>3);
1302 BYTE
* d
= (BYTE
*)dst
.bits
+ dst
.pitch
*rd
.top
+ rd
.left
;
1304 if(rd
.top
> rd
.bottom
) {
1305 d
= (BYTE
*)dst
.bits
+ dst
.pitch
*(rd
.top
-1) + rd
.left
;
1307 dst
.pitch
= -dst
.pitch
;
1310 for(ptrdiff_t j
= 0; j
< h
; j
++, s
+= src
.pitch
, d
+= dst
.pitch
) {
1312 BYTE
* s2end
= s2
+ w
*4;
1314 for(; s2
< s2end
; s2
+= 4, d2
++) {
1316 d2
[0] = ((d2
[0]*s2
[3])>>8) + s2
[1];
1320 dst
.pitch
= abs(dst
.pitch
);
1325 dst
.pitchUV
= dst
.pitch
;
1329 ss
[0] = (BYTE
*)src
.bits
+ src
.pitch
*rs
.top
+ rs
.left
*4;
1332 if(!dst
.bitsU
|| !dst
.bitsV
) {
1333 dst
.bitsU
= (BYTE
*)dst
.bits
+ dst
.pitch
*dst
.h
;
1334 dst
.bitsV
= dst
.bitsU
+ 1;
1336 if(dst
.type
== MSP_NV21
) {
1337 BYTE
* p
= dst
.bitsU
;
1338 dst
.bitsU
= dst
.bitsV
;
1344 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
;
1347 if(rd
.top
> rd
.bottom
) {
1348 dd
[0] = dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
;
1350 dst
.pitchUV
= -dst
.pitchUV
;
1353 for(ptrdiff_t i
= 0; i
< 2; i
++) {
1357 for(ptrdiff_t j
= 0; j
< h2
; j
++, s
+= src
.pitch
*2, d
+= dst
.pitchUV
, is
+= src
.pitch
*2) {
1359 BYTE
* s2end
= s2
+ w
*4;
1362 for(; s2
< s2end
; s2
+= 8, d2
+=2, is2
+= 8) {
1363 unsigned int ia
= (s2
[3]+s2
[3+src
.pitch
]+is2
[3]+is2
[3+src
.pitch
])>>2;
1365 *d2
= ((*d2
*ia
)>>8) + ((s2
[0]+s2
[src
.pitch
])>>1);
1374 STDMETHODIMP
CMemSubPic::AlphaBltAnv12_P010( const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
1376 //fix me: check colorspace and log error
1377 const SubPicDesc
& src
= m_spd
;
1378 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
1380 CRect
rs(*pSrc
), rd(*pDst
);
1384 rd
.bottom
= dst
.h
- rd
.bottom
;
1385 rd
.top
= dst
.h
- rd
.top
;
1387 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1388 return E_INVALIDARG
;
1390 int w
= rs
.Width(), h
= rs
.Height();
1391 bool bottom_down
= rd
.top
> rd
.bottom
;
1393 BYTE
* d
= reinterpret_cast<BYTE
*>(dst
.bits
) + dst
.pitch
*rd
.top
+ rd
.left
*2;
1396 d
= reinterpret_cast<BYTE
*>(dst
.bits
) + dst
.pitch
*(rd
.top
-1) + rd
.left
*2;
1397 dst
.pitch
= -dst
.pitch
;
1400 //dst.pitch = abs(dst.pitch);
1404 dst
.pitchUV
= abs(dst
.pitch
);
1406 dst
.bitsU
= reinterpret_cast<BYTE
*>(dst
.bits
) + abs(dst
.pitch
)*dst
.h
;
1407 BYTE
* ddUV
= dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
*2;
1410 ddUV
= dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
*2;
1411 dst
.pitchUV
= -dst
.pitchUV
;
1414 BYTE
* src_origin
= reinterpret_cast<BYTE
*>(src
.bits
) + src
.pitch
*rs
.top
+ rs
.left
;
1415 BYTE
*s
= src_origin
;
1418 // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
1419 // && (reinterpret_cast<intptr_t>(d2)&15)==0 )
1420 if( ((reinterpret_cast<intptr_t>(s
) | static_cast<intptr_t>(src
.pitch
) |
1421 reinterpret_cast<intptr_t>(d
) | static_cast<intptr_t>(dst
.pitch
) ) & 15 )==0 )
1423 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
1426 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
1427 BYTE
* s2end_mod16
= s2
+ (w
&~15);
1428 BYTE
* s2end
= s2
+ w
;
1431 for(; s2
< s2end_mod16
; s2
+=16, sa
+=16, d2
+=32)
1433 mix_16_y_p010_sse2(d2
, s2
, sa
);
1435 for( WORD
* d3
=reinterpret_cast<WORD
*>(d2
); s2
< s2end
; s2
++, sa
++, d3
++)
1439 d2
[0] = ((d2
[0]*sa
[0])>>8) + (s2
[0]<<8);
1444 else //fix me: only a workaround for non-mod-16 size video
1446 for(int i
=0; i
<h
; i
++, s
+= src
.pitch
, d
+= dst
.pitch
)
1449 BYTE
* s2
= s
+ src
.pitch
*src
.h
;
1450 BYTE
* s2end_mod16
= s2
+ (w
&~15);
1451 BYTE
* s2end
= s2
+ w
;
1452 WORD
* d2
= reinterpret_cast<WORD
*>(d
);
1453 for(; s2
< s2end
; s2
+=1, sa
+=1, d2
+=1)
1457 d2
[0] = ((d2
[0]*sa
[0])>>8) + (s2
[0]<<8);
1464 BYTE
* sa
= src_origin
;
1465 BYTE
* s_uv
= src_origin
+ src
.pitch
*src
.h
*2;//UV
1466 if( ((reinterpret_cast<intptr_t>(sa
) | static_cast<intptr_t>(src
.pitch
) |
1467 reinterpret_cast<intptr_t>(d
) | static_cast<intptr_t>(dst
.pitch
) ) & 15 )==0 )
1469 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
1473 BYTE
* s_u2end_mod16
= s_u2
+ (w
&~15);
1474 BYTE
* s_u2end
= s_u2
+ w
;
1477 for(; s_u2
< s_u2end_mod16
; s_u2
+=16, sa2
+=16, d2
+=32)
1479 mix_16_uv_p010_sse2(d2
, s_u2
, sa2
, src
.pitch
);
1482 for( WORD
* d3
=reinterpret_cast<WORD
*>(d2
); s_u2
< s_u2end
; s_u2
+=2, sa2
+=2, d3
+=2)
1486 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
]);
1489 d3
[0] = (((d3
[0])*ia
)>>10) + (s_u2
[0]<<8);
1490 d3
[1] = (((d3
[1])*ia
)>>10) + (s_u2
[1]<<8);
1497 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
1501 BYTE
* s_u2end
= s_u2
+ w
;
1504 for( WORD
* d3
=reinterpret_cast<WORD
*>(d2
); s_u2
< s_u2end
; s_u2
+=2, sa2
+=2, d3
+=2)
1508 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
]);
1511 d3
[0] = (((d3
[0])*ia
)>>10) + (s_u2
[0]<<8);
1512 d3
[1] = (((d3
[1])*ia
)>>10) + (s_u2
[1]<<8);
1520 STDMETHODIMP
CMemSubPic::AlphaBltAnv12_Nvxx( const RECT
* pSrc
, const RECT
* pDst
, SubPicDesc
* pTarget
)
1522 //fix me: check colorspace and log error
1523 const SubPicDesc
& src
= m_spd
;
1524 SubPicDesc dst
= *pTarget
; // copy, because we might modify it
1526 CRect
rs(*pSrc
), rd(*pDst
);
1530 rd
.bottom
= dst
.h
- rd
.bottom
;
1531 rd
.top
= dst
.h
- rd
.top
;
1533 if(rs
.Width() != rd
.Width() || rs
.Height() != abs(rd
.Height())) {
1534 return E_INVALIDARG
;
1536 int w
= rs
.Width(), h
= rs
.Height();
1537 bool bottom_down
= rd
.top
> rd
.bottom
;
1539 BYTE
* d
= reinterpret_cast<BYTE
*>(dst
.bits
) + dst
.pitch
*rd
.top
+ rd
.left
;
1542 d
= reinterpret_cast<BYTE
*>(dst
.bits
) + dst
.pitch
*(rd
.top
-1) + rd
.left
;
1543 dst
.pitch
= -dst
.pitch
;
1546 //dst.pitch = abs(dst.pitch);
1550 dst
.pitchUV
= abs(dst
.pitch
);
1554 dst
.bitsU
= reinterpret_cast<BYTE
*>(dst
.bits
) + abs(dst
.pitch
)*dst
.h
;
1556 BYTE
* ddUV
= dst
.bitsU
+ dst
.pitchUV
*rd
.top
/2 + rd
.left
;
1559 ddUV
= dst
.bitsU
+ dst
.pitchUV
*(rd
.top
/2-1) + rd
.left
;
1560 dst
.pitchUV
= -dst
.pitchUV
;
1563 BYTE
* sa
= reinterpret_cast<BYTE
*>(src
.bits
) + src
.pitch
*rs
.top
+ rs
.left
;
1565 BYTE
* s_uv
= sa
+ src
.pitch
*src
.h
*2;//UV
1567 AlphaBltYv12Luma( d
, dst
.pitch
, w
, h
, sa
+ src
.pitch
*src
.h
, sa
, src
.pitch
);
1568 if( ((reinterpret_cast<intptr_t>(sa
) | static_cast<intptr_t>(src
.pitch
) |
1569 reinterpret_cast<intptr_t>(ddUV
) | static_cast<intptr_t>(dst
.pitchUV
) ) & 15 )==0 )
1572 int pitch
= src
.pitch
;
1573 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
1577 BYTE
* s_u2end_mod16
= s_u2
+ (w
&~15);
1578 BYTE
* s_u2end
= s_u2
+ w
;
1581 for(; s_u2
< s_u2end_mod16
; s_u2
+=16, sa2
+=16, d2
+=16)
1583 mix_16_uv_nvxx_sse2(d2
, s_u2
, sa2
, src
.pitch
);
1585 for( BYTE
* d3
=d2
; s_u2
< s_u2end
; s_u2
+=2, sa2
+=2, d3
+=2)
1589 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
]);
1592 d3
[0] = (((d3
[0])*ia
)>>10) + s_u2
[0];
1593 d3
[1] = (((d3
[1])*ia
)>>10) + s_u2
[1];
1601 int pitch
= src
.pitch
;
1602 for(int j
= 0; j
< h2
; j
++, s_uv
+= src
.pitch
, sa
+= src
.pitch
*2, d
+= dst
.pitchUV
)
1606 BYTE
* s_u2end_mod16
= s_u2
+ (w
&~15);
1607 BYTE
* s_u2end
= s_u2
+ w
;
1610 for( BYTE
* d3
=d2
; s_u2
< s_u2end
; s_u2
+=2, sa2
+=2, d3
+=2)
1614 sa2
[0+src
.pitch
]+sa2
[1+src
.pitch
]);
1617 d3
[0] = (((d3
[0])*ia
)>>10) + s_u2
[0];
1618 d3
[1] = (((d3
[1])*ia
)>>10) + s_u2
[1];
1627 STDMETHODIMP
CMemSubPic::SetDirtyRectEx(CAtlList
<CRect
>* dirtyRectList
)
1629 //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1630 if(dirtyRectList
!=NULL
)
1632 POSITION pos
= dirtyRectList
->GetHeadPosition();
1633 if(m_spd
.type
== MSP_AYUV_PLANAR
|| m_alpha_blt_dst_type
==MSP_IYUV
|| m_alpha_blt_dst_type
==MSP_YV12
1634 || m_alpha_blt_dst_type
==MSP_P010
|| m_alpha_blt_dst_type
==MSP_P016
1635 || m_alpha_blt_dst_type
==MSP_NV12
|| m_alpha_blt_dst_type
==MSP_NV21
)
1639 CRect
& cRectSrc
= dirtyRectList
->GetNext(pos
);
1640 cRectSrc
.left
&= ~15;
1641 cRectSrc
.right
= (cRectSrc
.right
+15)&~15;
1642 if(cRectSrc
.right
>m_spd
.w
)
1644 cRectSrc
.right
= m_spd
.w
;
1647 cRectSrc
.bottom
= (cRectSrc
.bottom
+1)&~1;
1650 else if(m_spd
.type
== MSP_XY_AUYV
|| m_alpha_blt_dst_type
==MSP_YUY2
)
1654 CRect
& cRectSrc
= dirtyRectList
->GetNext(pos
);
1655 cRectSrc
.left
&= ~3;
1656 cRectSrc
.right
= (cRectSrc
.right
+3)&~3;
1660 return __super::SetDirtyRectEx(dirtyRectList
);
1664 // CMemSubPicAllocator
1667 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type
, SIZE maxsize
, int type
/*=-1*/)
1668 : CSubPicExAllocatorImpl(maxsize
, false, false)
1669 , m_alpha_blt_dst_type(alpha_blt_dst_type
)
1670 , m_maxsize(maxsize
)
1675 switch(alpha_blt_dst_type
)
1678 m_type
= MSP_XY_AUYV
;
1689 m_type
= MSP_AYUV_PLANAR
;
1698 // ISubPicAllocatorImpl
1700 bool CMemSubPicAllocator::AllocEx(bool fStatic
, ISubPicEx
** ppSubPic
)
1706 spd
.w
= m_maxsize
.cx
;
1707 spd
.h
= m_maxsize
.cy
;
1709 spd
.pitch
= (spd
.w
*spd
.bpp
)>>3;
1711 spd
.bits
= DNew BYTE
[spd
.pitch
*spd
.h
];
1715 *ppSubPic
= DNew
CMemSubPic(spd
, m_alpha_blt_dst_type
);
1719 (*ppSubPic
)->AddRef();