/*
 * Copyright (C) 2003-2006 Gabest
 * http://www.gabest.org
 *
 * This Program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This Program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 */
#include "MemSubPic.h"
#include "color_conv_table.h"
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);

#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
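
// Note on the two averaging macros above (a reading aid, not part of the
// original logic): both start from two rows of 8-bit samples and produce, per
// 16-bit lane, the rounded average of a 2x2 block,
//     avg4 = ((a + b + 1)/2 + (c + d + 1)/2 + 1)/2,
// which matches the scalar subsample_and_interlace_2_line_c below.
// AVERAGE_4_PIX_INTRINSICS leaves the result only in the low byte of each
// 16-bit lane (high byte cleared); AVERAGE_4_PIX_INTRINSICS_2 swaps and ORs
// instead of shifting down, so the same average ends up in both bytes of the
// lane, which is what the NV12/P010 mixers expect.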
void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        dst[0] = (dst[0] + tmp1 + 1)/2;
        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        dst[1] = (dst[1] + tmp1 + 1)/2;
    }
}
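
// The routine above (and its SSE2 twin below) downsamples two full-resolution
// rows of the planar U and V planes to one half-resolution row and writes the
// result interleaved as U,V,U,V..., i.e. the chroma layout used by NV12/P010
// targets. One caveat worth noting: the SSE2 version consumes 16 input bytes
// per iteration while the loop bound is `u + w`, so it assumes `w` (the dirty
// rect width) is a multiple of 16 and the planes are 16-byte aligned, which
// SetDirtyRectEx arranges by rounding dirty rects out to multiples of 16.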

__forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
        u_1 = _mm_packus_epi16(u_1, u_1);
        v_1 = _mm_packus_epi16(v_1, v_1);
        u_1 = _mm_unpacklo_epi8(u_1, v_1);

        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
}

static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
    __m128i zero = _mm_setzero_si128();

    __m128i ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128 = _mm_unpackhi_epi8(alpha128, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
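
// What the SSE2 block above computes, written out per byte (editorial aid;
// alpha here is "inverse alpha": 0x00 = subtitle fully opaque, 0xFF = keep
// the video pixel):
//     dst = ((dst*alpha + (alpha==0xFF ? 0xFF : 0)) >> 8) + sub
// The extra 0xFF makes the alpha==0xFF case an exact pass-through, since
// (dst*0xFF + 0xFF) >> 8 == dst, which is why the scalar fallbacks can guard
// with `if(sa[0] < 0xff)` and still match this path.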

/**
 * output not exactly identical to pix_alpha_blend_yv12_chroma
 */
static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
    __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

    __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );

    AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);

    __m128i ones = _mm_cmpeq_epi32(ones, ones);
    ones = _mm_cmpeq_epi8(ones, alpha128_1);

    dst128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i dst128_2 = _mm_and_si128(dst128, ones);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, dst128_2);

    dst128 = _mm_srli_epi16(dst128, 8);

    AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);

    dst128 = _mm_adds_epi16(dst128, sub128_1);
    dst128 = _mm_packus_epi16(dst128, dst128);

    _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
}

static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    unsigned int ia = (alpha[0]+alpha[1]+
                       alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
    if( ia!=0xff )
    {
        *dst = (((*dst)*ia)>>8) + ((src[0] +src[1]+
                                    src[src_pitch]+src[1+src_pitch] )>>2);
    }
}
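
// Chroma is blended at quarter resolution: the 2x2 block of alpha values and
// the 2x2 block of subtitle chroma samples are each averaged before the usual
// dst*alpha>>8 + sub formula is applied. The C version above truncates the
// alpha average (sum>>2) while the SSE2 version uses rounded pavgb averages,
// which is presumably what the "output not exactly identical" remark on the
// SSE2 helper refers to.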

static void AlphaBltYv12Luma(byte* dst, int dst_pitch,
    int w, int h,
    const byte* sub, const byte* alpha, int sub_pitch)
{
    if( ((reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch) |
        reinterpret_cast<intptr_t>(dst) | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 )
    {
        for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
        {
            const BYTE* sa = alpha;
            const BYTE* s2 = sub;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
            {
                pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
            }
            for(; s2 < s2end; s2++, sa++, d2++)
            {
                if(sa[0] < 0xff)
                {
                    d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
                }
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
        {
            const BYTE* sa = alpha;
            const BYTE* s2 = sub;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
            {
                if(sa[0] < 0xff)
                {
                    // d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
                    d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
                }
            }
        }
    }
}

static void AlphaBltYv12Chroma(byte* dst, int dst_pitch,
    int w, int chroma_h,
    const byte* sub_chroma, const byte* alpha, int sub_pitch)
{
    if( ((reinterpret_cast<intptr_t>(sub_chroma) |
        //reinterpret_cast<intptr_t>(dst) |
        reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch)
        //| (static_cast<intptr_t>(dst_pitch)&7)
        ) & 15 )==0 )
    {
        int pitch = sub_pitch;
        for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
        {
            const BYTE* s2 = sub_chroma;
            const BYTE* sa2 = alpha;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
            {
                pix_alpha_blend_yv12_chroma_sse2(d2, s2, sa2, sub_pitch);
            }
            for(; s2 < s2end; s2+=2, sa2+=2, d2++)
            {
                pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
        {
            const BYTE* s2 = sub_chroma;
            const BYTE* sa2 = alpha;
            const BYTE* s2end_mod16 = s2 + (w&~15);
            const BYTE* s2end = s2 + w;
            BYTE* d2 = dst;

            for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
            {
                pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
            }
        }
    }
}

__forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}
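
// P010/P016 carry 16-bit luma, so the mixer above cannot reuse the 8-bit
// (dst*alpha)>>8 trick directly. Instead it builds a 16-bit multiplier
// (alpha<<8)+0xFF (the +0x100 variant would overflow, as the inline comment
// says), takes the high 16 bits of the product with _mm_mulhi_epu16, and then
// adds 1 for non-zero dst words when alpha==0xFF, so a fully transparent
// subtitle pixel leaves the video word bit-exact. The subtitle luma is added
// in the high byte (src<<8), matching mix_16_y_p010_c below.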

void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<16;i++)
    {
        if (src_alpha[i]!=0xff)
        {
            dst_word[i] = ((dst_word[i] *src_alpha[i])>>8) + (src[i]<<8);
        }
    }
}

__forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );

    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);

    __m128i alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}

void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xff )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}
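
// Arithmetic note for the UV mixers (editorial, follows directly from the
// code): the C helper above first averages the four alphas of a 2x2 block
// with rounding, e.g. alphas 0x20, 0x22, 0x24, 0x26 give
// ((0x20+0x24+1)/2 + (0x22+0x26+1)/2 + 1)/2 = 0x23, and then scales the
// 16-bit chroma word via (dst*ia)>>8. The scalar tail loops inside
// AlphaBltAnv12_P010/Nvxx keep the plain 4-alpha sum instead and shift by 10,
// which is the same scale because (dst*(4*ia))>>10 == (dst*ia)>>8.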

__forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
    __m128i zero = _mm_setzero_si128();

    __m128i ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128_1);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}

void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xff )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}

//
// CMemSubPic
//

CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
    : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
{
    m_maxsize.SetSize(spd.w, spd.h);
    //m_rcDirty.SetRect(0, 0, spd.w, spd.h);
    CRect allSpd(0,0,spd.w, spd.h);
    m_rectListDirty.AddTail(allSpd);
}

CMemSubPic::~CMemSubPic()
{
    delete [] m_spd.bits, m_spd.bits = NULL;
}

STDMETHODIMP_(void*) CMemSubPic::GetObject() const
{
    return (void*)&m_spd;
}

STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
{
    spd.type = m_spd.type;
    spd.w = m_size.cx;
    spd.h = m_size.cy;
    spd.bpp = m_spd.bpp;
    spd.pitch = m_spd.pitch;
    spd.bits = m_spd.bits;
    spd.bitsU = m_spd.bitsU;
    spd.bitsV = m_spd.bitsV;
    spd.vidrect = m_vidrect;
    return S_OK;
}

STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
{
    HRESULT hr;
    if(FAILED(hr = __super::CopyTo(pSubPic))) {
        return hr;
    }

    SubPicDesc src, dst;
    if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
        return E_FAIL;
    }
    while(!m_rectListDirty.IsEmpty())
    {
        CRect& cRect = m_rectListDirty.GetHead();
        int w = cRect.Width(), h = cRect.Height();
        BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
        BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
            memcpy(d, s, w*4);
        m_rectListDirty.RemoveHead();
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
{
    if(m_rectListDirty.IsEmpty()) {
        return S_FALSE;
    }
    while(!m_rectListDirty.IsEmpty())
    {
        //pDirtyRect = m_rectListDirty.RemoveHead();
        CRect& dirtyRect = m_rectListDirty.RemoveTail();
        BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
        int w = dirtyRect.Width();
        if(m_spd.type!=MSP_AYUV_PLANAR)
        {
            for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
            {
                memsetd(p, color, w*4); // nya
            }
        }
        else
        {
            for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
            {
                //        memsetd(p, 0, m_rcDirty.Width());
                //DbgLog((LOG_TRACE, 3, "w:%d", w));
                //w = pDirtyRect->Width();
                memset(p, 0xFF, w);
                memset(p+m_spd.h*m_spd.pitch, 0, w);
                memset(p+m_spd.h*m_spd.pitch*2, 0, w);
                memset(p+m_spd.h*m_spd.pitch*3, 0, w);
            }
        }
    }
    m_rectListDirty.RemoveAll();
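
    // For MSP_AYUV_PLANAR the "clear" above resets the four stacked planes
    // individually: the alpha plane is filled with 0xFF, which in this file's
    // inverse-alpha convention means "show the video unchanged", and the Y, U
    // and V planes are zeroed. Packed formats are simply flooded with the
    // caller-supplied color DWORD.
    return S_OK;
}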

STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
{
    return GetDesc(spd);
}

STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
{
    int src_type = m_spd.type;
    int dst_type = m_alpha_blt_dst_type;
    if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
                                dst_type == MSP_RGB24 ||
                                dst_type == MSP_RGB16 ||
                                dst_type == MSP_RGB15))
        ||
        (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
        ||
        (src_type==MSP_AYUV && dst_type == MSP_AYUV)
        ||
        (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
                                       dst_type == MSP_YV12 ||
                                       dst_type == MSP_P010 ||
                                       dst_type == MSP_P016 ||
                                       dst_type == MSP_NV12 ||
                                       dst_type == MSP_NV21)))
    {
        return UnlockOther(dirtyRectList);
    }
    else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
                                   dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
                                   dst_type == MSP_IYUV ||
                                   dst_type == MSP_YV12 ||
                                   dst_type == MSP_NV12 ||
                                   dst_type == MSP_NV21 ||
                                   dst_type == MSP_P010 ||
                                   dst_type == MSP_P016))
    {
        return UnlockRGBA_YUV(dirtyRectList);
    }
    return E_NOTIMPL;
}

STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
{
    SetDirtyRectEx(dirtyRectList);
    if(m_rectListDirty.IsEmpty()) {
        return S_OK;
    }

    POSITION pos = m_rectListDirty.GetHeadPosition();
    while(pos!=NULL)
    {
        const CRect& cRect = m_rectListDirty.GetNext(pos);
        int w = cRect.Width(), h = cRect.Height();
        BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
        BYTE* bottom = top + m_spd.pitch*h;
        if(m_alpha_blt_dst_type == MSP_RGB16)
        {
            for(; top < bottom ; top += m_spd.pitch)
            {
                DWORD* s = (DWORD*)top;
                DWORD* e = s + w;
                for(; s < e; s++)
                {
                    *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
                    // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_RGB15)
        {
            for(; top < bottom; top += m_spd.pitch)
            {
                DWORD* s = (DWORD*)top;
                DWORD* e = s + w;
                for(; s < e; s++)
                {
                    *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
                    // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_YUY2)
        {
            XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );

            for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
            {
                BYTE* s = tempTop;
                BYTE* e = s + w*4;
                for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
                {
                    s[4] = (s[0] + s[4])>>1;
                    s[0] = (s[2] + s[6])>>1;
                }
            }

            XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
        }
        else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV)
        {
            // nothing to do for planar 4:2:0 targets here
        }
        else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
            || m_alpha_blt_dst_type == MSP_NV12 )
        {
            SubsampleAndInterlace(cRect, true);
        }
        else if( m_alpha_blt_dst_type == MSP_NV21 )
        {
            SubsampleAndInterlace(cRect, false);
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
{
    SetDirtyRectEx(dirtyRectList);
    if(m_rectListDirty.IsEmpty()) {
        return S_OK;
    }

    const ColorConvTable* conv_table = ColorConvTable::GetDefaultColorConvTable();
    const int *c2y_yb = conv_table->c2y_yb;
    const int *c2y_yg = conv_table->c2y_yg;
    const int *c2y_yr = conv_table->c2y_yr;
    const int cy_cy2 = conv_table->cy_cy2;
    const int c2y_cu = conv_table->c2y_cu;
    const int c2y_cv = conv_table->c2y_cv;
    const int cy_cy = conv_table->cy_cy;
    const unsigned char* Clip = conv_table->Clip;

    POSITION pos = m_rectListDirty.GetHeadPosition();
    while(pos!=NULL)
    {
        const CRect& cRect = m_rectListDirty.GetNext(pos);
        int w = cRect.Width(), h = cRect.Height();

        BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
        BYTE* bottom = top + m_spd.pitch*h;

        if( m_alpha_blt_dst_type == MSP_YUY2 ||
            m_alpha_blt_dst_type == MSP_YV12 ||
            m_alpha_blt_dst_type == MSP_IYUV ||
            m_alpha_blt_dst_type == MSP_P010 ||
            m_alpha_blt_dst_type == MSP_P016 ||
            m_alpha_blt_dst_type == MSP_NV12 ||
            m_alpha_blt_dst_type == MSP_NV21 ) {
            for(; top < bottom ; top += m_spd.pitch) {
                BYTE* s = top;
                BYTE* e = s + w*4;
                for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
                    if((s[3]+s[7]) < 0x1fe) {
                        int a = 0x200 - (s[3]+s[7]);
                        a <<= 7; // scale so 0x10*a and 0x80*a carry the 16/128 offsets after >>16

                        s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
                        s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a + 0x8000) >> 16;

                        int scaled_y = (s[1]+s[5]-32) * cy_cy2;

                        s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
                        s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
                    }
                    else {
                        s[1] = s[5] = 0x10;
                        s[0] = s[4] = 0x80;
                    }
                }
            }
        }
        else if(m_alpha_blt_dst_type == MSP_AYUV) {
            for(; top < bottom ; top += m_spd.pitch) {
                BYTE* s = top;
                BYTE* e = s + w*4;
                for(; s < e; s+=4) { // ARGB -> AYUV
                    if(s[3] < 0xff) {
                        int a = 0x100 - s[3];
                        a <<= 8;

                        int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
                        int scaled_y = (y-32) * cy_cy;
                        s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
                        s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
                        s[2] = y;
                    }
                    else {
                        s[0] = s[1] = 0x80;
                        s[2] = 0x10;
                    }
                }
            }
        }
    }
    return S_OK;
}

void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
{
    //fix me: check alignment and log error
    int w = cRect.Width(), h = cRect.Height();
    BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
    BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
    BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
    BYTE* dst = u_start;
    if(!u_first)
    {
        BYTE* tmp = u_start;
        u_start = v_start;
        v_start = tmp;
    }

    for (int i=0;i<h;i+=2)
    {
        subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w, m_spd.pitch);
        u_start += 2*m_spd.pitch;
        v_start += 2*m_spd.pitch;
        dst += m_spd.pitch;
    }
}

STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    if(!pSrc || !pDst || !pTarget) {
        return E_POINTER;
    }
    int src_type = m_spd.type;
    int dst_type = pTarget->type;

    if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
                                dst_type == MSP_RGB24 ||
                                dst_type == MSP_RGB16 ||
                                dst_type == MSP_RGB15 ||
                                dst_type == MSP_RGBA ||
                                dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
                                dst_type == MSP_AYUV))
        ||
        (src_type==MSP_XY_AUYV && dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
        ||
        (src_type==MSP_AYUV && dst_type == MSP_AYUV)
        ||
        (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
                                       dst_type == MSP_YV12)) )
    {
        return AlphaBltOther(pSrc, pDst, pTarget);
    }
    else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
                                            dst_type == MSP_NV21) )
    {
        return AlphaBltAnv12_Nvxx(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
                                           dst_type == MSP_P016) )
    {
        return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
                                    dst_type == MSP_YV12))
    {
        return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_NV12 ||
                                    dst_type == MSP_NV21))
    {
        return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
    }
    else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
                                    dst_type == MSP_P016))
    {
        return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
    }
    return E_NOTIMPL;
}
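
// Dispatch summary for AlphaBlt (editorial recap of the branches above):
//   MSP_RGBA        -> RGB15/16/24/32, RGBA, YUY2, AYUV : AlphaBltOther
//   MSP_RGBA        -> IYUV/YV12                        : AlphaBltAxyuAxyv_Yv12
//   MSP_RGBA        -> NV12/NV21                        : AlphaBltAxyuAxyv_Nv12
//   MSP_RGBA        -> P010/P016                        : AlphaBltAxyuAxyv_P010
//   MSP_XY_AUYV     -> YUY2                             : AlphaBltOther
//   MSP_AYUV        -> AYUV                             : AlphaBltOther
//   MSP_AYUV_PLANAR -> IYUV/YV12                        : AlphaBltOther
//   MSP_AYUV_PLANAR -> NV12/NV21                        : AlphaBltAnv12_Nvxx
//   MSP_AYUV_PLANAR -> P010/P016                        : AlphaBltAnv12_P010
// Anything else falls through to E_NOTIMPL.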

STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
    if(rd.top > rd.bottom)
    {
        if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
            || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
            || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
        {
            d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
        }
        else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
        {
            d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
        }
        else
        {
            return E_NOTIMPL;
        }
        dst.pitch = -dst.pitch;
    }
    DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
    switch(dst.type)
    {
    case MSP_RGBA:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0xff)
                {
                    DWORD bd =0x00000100 -( (DWORD) s2[3]);
                    DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
                    DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
                    DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
                    *d2 = B | V | R
                        | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
                }
            }
        }
        break;
    case MSP_RGB32:
    case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                DWORD ia = 256-s2[3];
                if(s2[3] < 0xff)
                {
                    *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
                        | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
                    /*
                    *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
                        | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
                    */
                }
            }
        }
        break;
    case MSP_RGB24:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            for(; s2 < s2end; s2 += 4, d2 += 3)
            {
                if(s2[3] < 0xff)
                {
                    d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
                    d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
                    d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
                }
            }
        }
        break;
    case MSP_RGB16:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            WORD* d2 = (WORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0x1f)
                {
                    *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
                        | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
                    /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
                        | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
                        | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
                    */
                }
            }
        }
        break;
    case MSP_RGB15:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            WORD* d2 = (WORD*)d;
            for(; s2 < s2end; s2 += 4, d2++)
            {
                if(s2[3] < 0x1f)
                {
                    *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
                        | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
                    /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
                        | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
                        | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
                    */
                }
            }
        }
        break;
    case MSP_YUY2:
        for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
        {
            unsigned int ia, c;
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            DWORD* d2 = (DWORD*)d;
            for(; s2 < s2end; s2 += 8, d2++)
            {
                ia = (s2[3]+s2[7])>>1;
                if(ia < 0xff)
                {
                    //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
                    //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
                    //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
                    //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
                    //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;

                    ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
                    c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
                    __asm
                    {
                        mov         esi, s2
                        mov         edi, d2
                        pxor        mm0, mm0
                        movd        mm2, c
                        punpcklbw   mm2, mm0
                        movd        mm3, [edi]
                        punpcklbw   mm3, mm0
                        movd        mm4, ia
                        punpcklbw   mm4, mm0
                        psraw       mm4, 1 //or else, overflow because psraw shift in sign bit
                        pmullw      mm3, mm4
                        psraw       mm3, 7
                        paddsw      mm3, mm2
                        packuswb    mm3, mm3
                        movd        [edi], mm3
                    };
                }
            }
        }
        break;
    case MSP_YV12:
    case MSP_IYUV:
        {
            //dst.pitch = abs(dst.pitch);
            int h2 = h/2;
            if(!dst.pitchUV)
            {
                dst.pitchUV = abs(dst.pitch)/2;
            }
            if(!dst.bitsU || !dst.bitsV)
            {
                dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
                dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
                if(dst.type == MSP_YV12)
                {
                    BYTE* p = dst.bitsU;
                    dst.bitsU = dst.bitsV;
                    dst.bitsV = p;
                }
            }
            BYTE* dd[2];
            dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
            dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
            if(rd.top > rd.bottom)
            {
                dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
                dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
                dst.pitchUV = -dst.pitchUV;
            }

            BYTE* src_origin = (BYTE*)src.bits + src.pitch*rs.top + rs.left;

            BYTE* ss[2];
            ss[0] = src_origin + src.pitch*src.h*2;//U
            ss[1] = src_origin + src.pitch*src.h*3;//V

            AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );

            AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
            AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
        }
        break;
    default:
        return E_NOTIMPL;
    }

    //emms takes about 40 CPU cycles
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
    {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        WORD* d2 = reinterpret_cast<WORD*>(d);
        for(; s2 < s2end; s2 += 4, d2++)
        {
            if(s2[3] < 0xff)
            {
                d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
            }
        }
    }

    //UV
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    if(!dst.bitsU || !dst.bitsV)
    {
        dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
        dst.bitsV = dst.bitsU + 2;
    }
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
    if(rd.top > rd.bottom)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
        dst.pitchUV = -dst.pitchUV;
    }

    s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    d = ddUV;
    int pitch = src.pitch;
    for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV)
    {
        BYTE* s2 = s;
        WORD* d2 = reinterpret_cast<WORD*>(d);
        WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
        for( ; d2<d2_end; s2+=8, d2+=2)
        {
            unsigned int ia = (s2[3]+s2[3+4]+
                s2[3+src.pitch]+s2[3+4+src.pitch]);
            if( ia!=0xFF*4 )
            {
                d2[0] = (((d2[0])*ia)>>10) + ((s2[0] + s2[0+src.pitch])<<7);
                d2[1] = (((d2[1])*ia)>>10) + ((s2[4] + s2[4+src.pitch])<<7);
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        BYTE* d2 = d;
        for(; s2 < s2end; s2 += 4, d2++) {
            if(s2[3] < 0xff) {
                d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
            }
        }
    }
    dst.pitch = abs(dst.pitch);

    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = dst.pitch/2;
    }

    BYTE* ss[2];
    ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
    ss[1] = ss[0] + 4;

    if(!dst.bitsU || !dst.bitsV) {
        dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
        dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;

        if(dst.type == MSP_YV12) {
            BYTE* p = dst.bitsU;
            dst.bitsU = dst.bitsV;
            dst.bitsV = p;
        }
    }

    BYTE* dd[2];
    dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
    dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;

    if(rd.top > rd.bottom) {
        dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
        dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
        dst.pitchUV = -dst.pitchUV;
    }

    for(ptrdiff_t i = 0; i < 2; i++) {
        s = ss[i];
        d = dd[i];
        BYTE* is = ss[1-i];
        for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            BYTE* is2 = is;
            for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
                unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
                if(ia < 0xff) {
                    *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
{
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }

    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }

    int w = rs.Width(), h = rs.Height();

    BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
    BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;

    if(rd.top > rd.bottom) {
        d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;

        dst.pitch = -dst.pitch;
    }

    for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
        BYTE* s2 = s;
        BYTE* s2end = s2 + w*4;
        BYTE* d2 = d;
        for(; s2 < s2end; s2 += 4, d2++) {
            if(s2[3] < 0xff) {
                d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
            }
        }
    }
    dst.pitch = abs(dst.pitch);

    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = dst.pitch;
    }

    BYTE* ss[2];
    ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
    ss[1] = ss[0] + 4;

    if(!dst.bitsU || !dst.bitsV) {
        dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
        dst.bitsV = dst.bitsU + 1;

        if(dst.type == MSP_NV21) {
            BYTE* p = dst.bitsU;
            dst.bitsU = dst.bitsV;
            dst.bitsV = p;
        }
    }

    BYTE* dd[2];
    dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
    dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left;

    if(rd.top > rd.bottom) {
        dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
        dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left;
        dst.pitchUV = -dst.pitchUV;
    }

    for(ptrdiff_t i = 0; i < 2; i++) {
        s = ss[i];
        d = dd[i];
        BYTE* is = ss[1-i];
        for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
            BYTE* s2 = s;
            BYTE* s2end = s2 + w*4;
            BYTE* d2 = d;
            BYTE* is2 = is;
            for(; s2 < s2end; s2 += 8, d2+=2, is2 += 8) {
                unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
                if(ia < 0xff) {
                    *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    //fix me: check colorspace and log error
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    bool bottom_down = rs.top > rd.bottom;

    BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
    if(bottom_down)
    {
        d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
        dst.pitch = -dst.pitch;
    }

    //dst.pitch = abs(dst.pitch);
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
    if(bottom_down)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
        dst.pitchUV = -dst.pitchUV;
    }

    BYTE* src_origin = reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
    BYTE*s = src_origin;

    // if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
    //     && (reinterpret_cast<intptr_t>(d2)&15)==0 )
    if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
    {
        for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
        {
            BYTE* sa = s;
            BYTE* s2 = s + src.pitch*src.h;
            BYTE* s2end_mod16 = s2 + (w&~15);
            BYTE* s2end = s2 + w;
            BYTE* d2 = d;

            for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=32)
            {
                mix_16_y_p010_sse2(d2, s2, sa);
            }
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
            {
                if(sa[0] < 0xff)
                {
                    d3[0] = ((d3[0]*sa[0])>>8) + (s2[0]<<8);
                }
            }
        }
    }
    else //fix me: only a workaround for non-mod-16 size video
    {
        for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
        {
            BYTE* sa = s;
            BYTE* s2 = s + src.pitch*src.h;
            BYTE* s2end_mod16 = s2 + (w&~15);
            BYTE* s2end = s2 + w;
            WORD* d2 = reinterpret_cast<WORD*>(d);
            for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
            {
                if(sa[0] < 0xff)
                {
                    d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
                }
            }
        }
    }

    d = ddUV;
    BYTE* sa = src_origin;
    BYTE* s_uv = src_origin + src.pitch*src.h*2;//UV
    if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
    {
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;

            for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=32)
            {
                mix_16_uv_p010_sse2(d2, s_u2, sa2, src.pitch);
            }
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
                    d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
                }
            }
        }
    }
    else
    {
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;
            for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
                    d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::AlphaBltAnv12_Nvxx( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
{
    //fix me: check colorspace and log error
    const SubPicDesc& src = m_spd;
    SubPicDesc dst = *pTarget; // copy, because we might modify it

    CRect rs(*pSrc), rd(*pDst);
    if(dst.h < 0)
    {
        dst.h = -dst.h;
        rd.bottom = dst.h - rd.bottom;
        rd.top = dst.h - rd.top;
    }
    if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
        return E_INVALIDARG;
    }
    int w = rs.Width(), h = rs.Height();
    bool bottom_down = rs.top > rd.bottom;

    BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
    if(bottom_down)
    {
        d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
        dst.pitch = -dst.pitch;
    }

    //dst.pitch = abs(dst.pitch);
    int h2 = h/2;
    if(!dst.pitchUV)
    {
        dst.pitchUV = abs(dst.pitch);
    }
    if(!dst.bitsU)
    {
        dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
    }
    BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
    if(bottom_down)
    {
        ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
        dst.pitchUV = -dst.pitchUV;
    }

    BYTE* sa = reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;

    BYTE* s_uv = sa + src.pitch*src.h*2;//UV

    AlphaBltYv12Luma( d, dst.pitch, w, h, sa + src.pitch*src.h, sa, src.pitch );
    if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
        reinterpret_cast<intptr_t>(ddUV) | static_cast<intptr_t>(dst.pitchUV) ) & 15 )==0 )
    {
        d = ddUV;
        int pitch = src.pitch;
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;

            for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=16)
            {
                mix_16_uv_nvxx_sse2(d2, s_u2, sa2, src.pitch);
            }
            for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
                    d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
                }
            }
        }
    }
    else
    {
        d = ddUV;
        int pitch = src.pitch;
        for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
        {
            BYTE* sa2 = sa;
            BYTE* s_u2 = s_uv;
            BYTE* s_u2end_mod16 = s_u2 + (w&~15);
            BYTE* s_u2end = s_u2 + w;
            BYTE* d2 = d;
            for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
            {
                unsigned int ia = (sa2[0]+sa2[1]+
                    sa2[0+src.pitch]+sa2[1+src.pitch]);
                if( ia!=0xFF*4 )
                {
                    d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
                    d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
                }
            }
        }
    }
    return S_OK;
}

STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
{
    //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
    if(dirtyRectList!=NULL)
    {
        POSITION pos = dirtyRectList->GetHeadPosition();
        if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
            || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
            || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21)
        {
            while(pos!=NULL)
            {
                CRect& cRectSrc = dirtyRectList->GetNext(pos);
                cRectSrc.left &= ~15;
                cRectSrc.right = (cRectSrc.right+15)&~15;
                cRectSrc.top &= ~1;
                cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
            }
        }
        else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
        {
            while(pos!=NULL)
            {
                CRect& cRectSrc = dirtyRectList->GetNext(pos);
                cRectSrc.left &= ~3;
                cRectSrc.right = (cRectSrc.right+3)&~3;
            }
        }
    }
    return __super::SetDirtyRectEx(dirtyRectList);
}

//
// CMemSubPicAllocator
//

CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type /*=-1*/)
    : CSubPicExAllocatorImpl(maxsize, false, false)
    , m_alpha_blt_dst_type(alpha_blt_dst_type)
    , m_maxsize(maxsize)
    , m_type(type)
{
    if(m_type==-1)
    {
        switch(alpha_blt_dst_type)
        {
        case MSP_YUY2:
            m_type = MSP_XY_AUYV;
            break;
        case MSP_AYUV:
            m_type = MSP_AYUV;
            break;
        case MSP_IYUV:
        case MSP_YV12:
        case MSP_P010:
        case MSP_P016:
        case MSP_NV12:
        case MSP_NV21:
            m_type = MSP_AYUV_PLANAR;
            break;
        default:
            m_type = MSP_RGBA;
            break;
        }
    }
}

// ISubPicAllocatorImpl

bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
{
    if(!ppSubPic) {
        return false;
    }
    SubPicDesc spd;
    spd.w = m_maxsize.cx;
    spd.h = m_maxsize.cy;
    spd.bpp = 32;
    spd.pitch = (spd.w*spd.bpp)>>3;
    spd.type = m_type;
    spd.bits = DNew BYTE[spd.pitch*spd.h];
    if(!spd.bits) {
        return false;
    }
    *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
    if(!(*ppSubPic)) {
        return false;
    }
    (*ppSubPic)->AddRef();
    return true;
}