src/subpic/MemSubPic.cpp

   1 /*
   2 *       Copyright (C) 2003-2006 Gabest
   3 *       http://www.gabest.org
   4 *
   5 *  This Program is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU General Public License as published by
   7 *  the Free Software Foundation; either version 2, or (at your option)
   8 *  any later version.
   9 *
  10 *  This Program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU General Public License
  16 *  along with GNU Make; see the file COPYING.  If not, write to
  17 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18 *  http://www.gnu.org/copyleft/gpl.html
  19 *
  20 */
  21
  22 #include "stdafx.h"
  23 #include "MemSubPic.h"
  24 #include "color_conv_table.h"
  25
  26 #define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
  27     m128_1 = _mm_avg_epu8(m128_1, m128_2); \
  28     m128_2 = _mm_slli_epi16(m128_1, 8); \
  29     m128_1 = _mm_srli_epi16(m128_1, 8); \
  30     m128_2 = _mm_srli_epi16(m128_2, 8); \
  31     m128_1 = _mm_avg_epu8(m128_1, m128_2);
  32
  33 #define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
  34     {\
  35     m128_1 = _mm_avg_epu8(m128_1, m128_2); \
  36     m128_2 = _mm_slli_epi16(m128_1, 8); \
  37     __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
  38     m128_2 = _mm_or_si128(m128_2, m128_3);\
  39     m128_1 = _mm_avg_epu8(m128_1, m128_2);\
  40     }
  41
  42 void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
  43 {
  44     const BYTE* end = u + w;
  45     for (;u<end;dst+=2,u+=2,v+=2)
  46     {
  47         dst[0] = (u[0] + u[0+pitch] + 1)/2;
  48         int tmp1 = (u[1] + u[1+pitch] + 1)/2;
  49         dst[0] = (dst[0] + tmp1 + 1)/2;
  50         dst[1] = (v[0] + v[0+pitch] + 1)/2;
  51         tmp1 = (v[1] + v[1+pitch] + 1)/2;
  52         dst[1] = (dst[1] + tmp1 + 1)/2;
  53     }
  54 }
  55
  56 __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
  57 {
  58     const BYTE* end = u + w;
  59     for (;u<end;dst+=16,u+=16,v+=16)
  60     {
  61         __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
  62         __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
  63         __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
  64         __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
  65         AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
  66         AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
  67         u_1 = _mm_packus_epi16(u_1, u_1);
  68         v_1 = _mm_packus_epi16(v_1, v_1);
  69         u_1 = _mm_unpacklo_epi8(u_1, v_1);
  70
  71         _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
  72     }
  73 }
  74
  75 static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
  76 {
  77     __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
  78     __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
  79     __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
  80     __m128i zero = _mm_setzero_si128();
  81
  82     __m128i ones;
  83 #ifdef _DEBUG
  84     ones = _mm_setzero_si128();//disable warning C4700
  85 #endif
  86     ones = _mm_cmpeq_epi32(ones,ones);
  87     ones = _mm_cmpeq_epi8(ones,alpha128);
  88
  89     __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
  90     __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);
  91
  92     __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
  93
  94     dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
  95     dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
  96     dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
  97
  98     dst128 = _mm_unpackhi_epi8(dst128, zero);
  99     alpha128 = _mm_unpackhi_epi8(alpha128, zero);
 100
 101     ones2 = _mm_unpackhi_epi8(ones, zero);
 102
 103     dst128 = _mm_mullo_epi16(dst128, alpha128);
 104     dst128 = _mm_adds_epu16(dst128, ones2);
 105     dst128 = _mm_srli_epi16(dst128, 8);
 106     dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
 107
 108     dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
 109     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
 110 }
 111
 112 /***
 113  * output not exactly identical to pix_alpha_blend_yv12_chroma
 114  */
 115 static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
 116 {
 117     __m128i zero = _mm_setzero_si128();
 118     __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
 119     __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
 120     __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );
 121
 122     __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
 123     __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
 124
 125     AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);
 126
 127     __m128i ones;
 128 #ifdef _DEBUG
 129     ones = _mm_setzero_si128();//disable warning C4700
 130 #endif
 131     ones = _mm_cmpeq_epi32(ones,ones);
 132     ones = _mm_cmpeq_epi8(ones, alpha128_1);
 133
 134     dst128 = _mm_unpacklo_epi8(dst128, zero);
 135     __m128i dst128_2 = _mm_and_si128(dst128, ones);
 136
 137     dst128 = _mm_mullo_epi16(dst128, alpha128_1);
 138     dst128 = _mm_adds_epu16(dst128, dst128_2);
 139
 140     dst128 = _mm_srli_epi16(dst128, 8);
 141
 142     AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);
 143
 144     dst128 = _mm_adds_epi16(dst128, sub128_1);
 145     dst128 = _mm_packus_epi16(dst128, dst128);
 146
 147     _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
 148 }
 149
 150 static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
 151 {
 152     unsigned int ia = (alpha[0]+alpha[1]+
 153         alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
 154     if(ia!=0xff)
 155     {
 156         *dst= (((*dst)*ia)>>8) + ((src[0]        +src[1]+
 157             src[src_pitch]+src[1+src_pitch] )>>2);
 158     }
 159 }
 160
 161 static void AlphaBltYv12Luma(byte* dst, int dst_pitch,
 162     int w, int h,
 163     const byte* sub, const byte* alpha, int sub_pitch)
 164 {
 165     if( ((reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch) |
 166         reinterpret_cast<intptr_t>(dst) | static_cast<intptr_t>(dst_pitch) ) & 15 )==0 )
 167     {
 168         for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
 169         {
 170             const BYTE* sa = alpha;
 171             const BYTE* s2 = sub;
 172             const BYTE* s2end_mod16 = s2 + (w&~15);
 173             const BYTE* s2end = s2 + w;
 174             BYTE* d2 = dst;
 175
 176             for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=16)
 177             {
 178                 pix_alpha_blend_yv12_luma_sse2(d2, sa, s2);
 179             }
 180             for(; s2 < s2end; s2++, sa++, d2++)
 181             {
 182                 if(sa[0] < 0xff)
 183                 {
 184                     d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
 185                 }
 186             }
 187         }
 188     }
 189     else //fix me: only a workaround for non-mod-16 size video
 190     {
 191         for(int i=0; i<h; i++, dst += dst_pitch, alpha += sub_pitch, sub += sub_pitch)
 192         {
 193             const BYTE* sa = alpha;
 194             const BYTE* s2 = sub;
 195             const BYTE* s2end_mod16 = s2 + (w&~15);
 196             const BYTE* s2end = s2 + w;
 197             BYTE* d2 = dst;
 198             for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
 199             {
 200                 if(sa[0] < 0xff)
 201                 {
 202                     //                                  d2[0] = (((d2[0]-0x10)*s2[3])>>8) + s2[1];
 203                     d2[0] = ((d2[0]*sa[0])>>8) + s2[0];
 204                 }
 205             }
 206         }
 207     }
 208 }
 209
 210 static void AlphaBltYv12Chroma(byte* dst, int dst_pitch,
 211     int w, int chroma_h,
 212     const byte* sub_chroma, const byte* alpha, int sub_pitch)
 213 {
 214     if( ((reinterpret_cast<intptr_t>(sub_chroma) |
 215         //reinterpret_cast<intptr_t>(dst) |
 216         reinterpret_cast<intptr_t>(alpha) | static_cast<intptr_t>(sub_pitch)
 217         //| (static_cast<intptr_t>(dst_pitch)&7)
 218         ) & 15 )==0 )
 219     {
 220         int pitch = sub_pitch;
 221         for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
 222         {
 223             const BYTE* s2 = sub_chroma;
 224             const BYTE* sa2 = alpha;
 225             const BYTE* s2end_mod16 = s2 + (w&~15);
 226             const BYTE* s2end = s2 + w;
 227             BYTE* d2 = dst;
 228
 229             for(; s2 < s2end_mod16; s2 += 16, sa2 += 16, d2+=8)
 230             {
 231                 pix_alpha_blend_yv12_chroma_sse2(d2, s2, sa2, sub_pitch);
 232             }
 233             for(; s2 < s2end; s2+=2, sa2+=2, d2++)
 234             {
 235                 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
 236             }
 237         }
 238     }
 239     else//fix me: only a workaround for non-mod-16 size video
 240     {
 241         for(int j = 0; j < chroma_h; j++, sub_chroma += sub_pitch*2, alpha += sub_pitch*2, dst += dst_pitch)
 242         {
 243             const BYTE* s2 = sub_chroma;
 244             const BYTE* sa2 = alpha;
 245             const BYTE* s2end_mod16 = s2 + (w&~15);
 246             const BYTE* s2end = s2 + w;
 247             BYTE* d2 = dst;
 248             for(; s2 < s2end; s2 += 2, sa2 += 2, d2++)
 249             {
 250                 pix_alpha_blend_yv12_chroma(d2, s2, sa2, sub_pitch);
 251             }
 252         }
 253     }
 254 }
 255
 256 __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
 257 {
 258     //important!
 259     __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
 260     __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
 261     __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
 262
 263     __m128i alpha_ff;
 264 #ifdef _DEBUG
 265     alpha_ff = _mm_setzero_si128();//disable warning C4700
 266 #endif
 267     alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
 268
 269     alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
 270
 271     __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
 272     //so we do it another way
 273     //first, (alpha<<8)+0xff
 274     __m128i ones = _mm_setzero_si128();
 275     ones = _mm_cmpeq_epi16(dst_y, ones);
 276
 277     __m128i ones2;
 278 #ifdef _DEBUG
 279     ones2 = _mm_setzero_si128();//disable warning C4700
 280 #endif
 281     ones2 = _mm_cmpeq_epi32(ones2,ones2);
 282
 283     ones = _mm_xor_si128(ones, ones2);
 284     ones = _mm_srli_epi16(ones, 15);
 285     ones = _mm_and_si128(ones, lo);
 286
 287     dst_y = _mm_mulhi_epu16(dst_y, lo);
 288     dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
 289
 290     lo = _mm_setzero_si128();
 291     lo = _mm_unpacklo_epi8(lo, src_y);
 292     dst_y = _mm_adds_epu16(dst_y, lo);
 293     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
 294
 295     dst += 16;
 296     dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
 297
 298     lo = _mm_unpackhi_epi8(alpha_ff, alpha);
 299
 300     ones = _mm_setzero_si128();
 301     ones = _mm_cmpeq_epi16(dst_y, ones);
 302     ones = _mm_xor_si128(ones, ones2);
 303     ones = _mm_srli_epi16(ones, 15);
 304     ones = _mm_and_si128(ones, lo);
 305
 306     dst_y = _mm_mulhi_epu16(dst_y, lo);
 307     dst_y = _mm_adds_epu16(dst_y, ones);
 308
 309     lo = _mm_setzero_si128();
 310     lo = _mm_unpackhi_epi8(lo, src_y);
 311     dst_y = _mm_adds_epu16(dst_y, lo);
 312     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
 313 }
 314
 315 //for test only
 316 void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
 317 {
 318     WORD* dst_word = reinterpret_cast<WORD*>(dst);
 319     for (int i=0;i<16;i++)
 320     {
 321         if (src_alpha[i]!=0xff)
 322         {
 323             dst_word[i] = ((dst_word[i] *src_alpha[i])>>8) + (src[i]<<8);
 324         }
 325     }
 326 }
 327
 328 __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
 329 {
 330     //important!
 331     __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
 332     __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
 333
 334     __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
 335     __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
 336
 337     AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);
 338
 339     __m128i alpha_ff;
 340 #ifdef _DEBUG
 341     alpha_ff = _mm_setzero_si128();//disable warning C4700
 342 #endif
 343     alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);
 344
 345     alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);
 346
 347     __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
 348     //so we do it another way
 349     //first, (alpha<<8)+0xff
 350     __m128i ones = _mm_setzero_si128();
 351     ones = _mm_cmpeq_epi16(dst_y, ones);
 352
 353     __m128i ones2;
 354 #ifdef _DEBUG
 355     ones2 = _mm_setzero_si128();//disable warning C4700
 356 #endif
 357     ones2 = _mm_cmpeq_epi32(ones2,ones2);
 358     ones = _mm_xor_si128(ones, ones2);
 359     ones = _mm_srli_epi16(ones, 15);
 360     ones = _mm_and_si128(ones, lo);
 361
 362     dst_y = _mm_mulhi_epu16(dst_y, lo);
 363     dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary
 364
 365     lo = _mm_setzero_si128();
 366     lo = _mm_unpacklo_epi8(lo, src_y);
 367     dst_y = _mm_adds_epu16(dst_y, lo);
 368     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
 369
 370     dst += 16;
 371     dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
 372
 373     lo = _mm_unpackhi_epi8(alpha_ff, alpha);
 374
 375     ones = _mm_setzero_si128();
 376     ones = _mm_cmpeq_epi16(dst_y, ones);
 377     ones = _mm_xor_si128(ones, ones2);
 378     ones = _mm_srli_epi16(ones, 15);
 379     ones = _mm_and_si128(ones, lo);
 380
 381     dst_y = _mm_mulhi_epu16(dst_y, lo);
 382     dst_y = _mm_adds_epu16(dst_y, ones);
 383
 384     lo = _mm_setzero_si128();
 385     lo = _mm_unpackhi_epi8(lo, src_y);
 386     dst_y = _mm_adds_epu16(dst_y, lo);
 387     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
 388 }
 389
 390 //for test only
 391 void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
 392 {
 393     WORD* dst_word = reinterpret_cast<WORD*>(dst);
 394     for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
 395     {
 396         unsigned int ia = (
 397             (src_alpha[0]+src_alpha[0+pitch]+1)/2+
 398             (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
 399         if( ia!=0xFF )
 400         {
 401             int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
 402             if(tmp>0xffff) tmp = 0xffff;
 403             dst_word[0] = tmp;
 404             tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
 405             if(tmp>0xffff) tmp = 0xffff;
 406             dst_word[1] = tmp;
 407         }
 408     }
 409 }
 410
 411 __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
 412 {
 413     __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
 414     __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
 415     __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
 416     __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
 417
 418     AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
 419     __m128i zero = _mm_setzero_si128();
 420
 421     __m128i ones;
 422 #ifdef _DEBUG
 423     ones = _mm_setzero_si128();//disable warning C4700
 424 #endif
 425     ones = _mm_cmpeq_epi32(ones,ones);
 426     ones = _mm_cmpeq_epi8(ones,alpha128_1);
 427
 428     __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
 429     alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);
 430
 431     __m128i ones2 = _mm_unpacklo_epi8(ones, zero);
 432
 433     dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
 434     dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
 435     dst_lo128 = _mm_srli_epi16(dst_lo128, 8);
 436
 437     dst128 = _mm_unpackhi_epi8(dst128, zero);
 438     alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);
 439
 440     ones2 = _mm_unpackhi_epi8(ones, zero);
 441
 442     dst128 = _mm_mullo_epi16(dst128, alpha128_1);
 443     dst128 = _mm_adds_epu16(dst128, ones2);
 444     dst128 = _mm_srli_epi16(dst128, 8);
 445     dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);
 446
 447     dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
 448     _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
 449 }
 450
 451 //for test only
 452 void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
 453 {
 454     for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
 455     {
 456         unsigned int ia = (
 457             (src_alpha[0]+src_alpha[0+pitch]+1)/2+
 458             (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
 459         if( ia!=0xFF )
 460         {
 461             dst[0] = (((dst[0])*ia)>>8) + src[0];
 462             dst[1] = (((dst[1])*ia)>>8) + src[1];
 463         }
 464     }
 465 }
 466
 467 //
 468 // CMemSubPic
 469 //
 470
 471 CMemSubPic::CMemSubPic(SubPicDesc& spd, int alpha_blt_dst_type)
 472     : m_spd(spd), m_alpha_blt_dst_type(alpha_blt_dst_type)
 473 {
 474     m_maxsize.SetSize(spd.w, spd.h);
 475     //  m_rcDirty.SetRect(0, 0, spd.w, spd.h);
 476     CRect allSpd(0,0,spd.w, spd.h);
 477     m_rectListDirty.AddTail(allSpd);
 478 }
 479
 480 CMemSubPic::~CMemSubPic()
 481 {
 482     delete [] m_spd.bits, m_spd.bits = NULL;
 483 }
 484
 485 // ISubPic
 486
 487 STDMETHODIMP_(void*) CMemSubPic::GetObject() const
 488 {
 489     return (void*)&m_spd;
 490 }
 491
 492 STDMETHODIMP CMemSubPic::GetDesc(SubPicDesc& spd) const
 493 {
 494     spd.type = m_spd.type;
 495     spd.w = m_size.cx;
 496     spd.h = m_size.cy;
 497     spd.bpp = m_spd.bpp;
 498     spd.pitch = m_spd.pitch;
 499     spd.bits = m_spd.bits;
 500     spd.bitsU = m_spd.bitsU;
 501     spd.bitsV = m_spd.bitsV;
 502     spd.vidrect = m_vidrect;
 503     return S_OK;
 504 }
 505
 506 STDMETHODIMP CMemSubPic::CopyTo(ISubPicEx* pSubPic)
 507 {
 508     HRESULT hr;
 509         if(FAILED(hr = __super::CopyTo(pSubPic))) {
 510         return hr;
 511         }
 512
 513         SubPicDesc src, dst;
 514         if(FAILED(GetDesc(src)) || FAILED(pSubPic->GetDesc(dst))) {
 515         return E_FAIL;
 516     }
 517     while(!m_rectListDirty.IsEmpty())
 518     {
 519         CRect& cRect = m_rectListDirty.GetHead();
 520         int w = cRect.Width(), h = cRect.Height();
 521         BYTE* s = (BYTE*)src.bits + src.pitch*cRect.top + cRect.left*4;
 522         BYTE* d = (BYTE*)dst.bits + dst.pitch*cRect.top + cRect.left*4;
 523         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 524             memcpy(d, s, w*4);
 525     }
 526     return S_OK;
 527 }
 528
 529 STDMETHODIMP CMemSubPic::ClearDirtyRect(DWORD color)
 530 {
 531     if(m_rectListDirty.IsEmpty()) {
 532         return S_OK;
 533         }
 534     while(!m_rectListDirty.IsEmpty())
 535     {
 536         //pDirtyRect = m_rectListDirty.RemoveHead();
 537         CRect& dirtyRect = m_rectListDirty.RemoveTail();
 538         BYTE* p = (BYTE*)m_spd.bits + m_spd.pitch*(dirtyRect.top) + dirtyRect.left*(m_spd.bpp>>3);
 539         int w = dirtyRect.Width();
 540         if(m_spd.type!=MSP_AYUV_PLANAR)
 541         {
 542             for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
 543             {
 544 #ifdef _WIN64
 545                                 memsetd(p, color, w*4); // nya
 546 #else
 547                 __asm
 548                 {
 549                         mov eax, color
 550                         mov ecx, w
 551                         mov edi, p
 552                         cld
 553                         rep stosd
 554                 }
 555
 556 #endif
 557             }
 558         }
 559         else
 560         {
 561             ///TODO:
 562             ///FIX ME
 563             for(int j = 0, h = dirtyRect.Height(); j < h; j++, p += m_spd.pitch)
 564             {
 565                 //        memsetd(p, 0, m_rcDirty.Width());
 566                 //DbgLog((LOG_TRACE, 3, "w:%d", w));
 567                 //w = pDirtyRect->Width();
 568                 memset(p, 0xFF, w);
 569                 memset(p+m_spd.h*m_spd.pitch, 0, w);
 570                 memset(p+m_spd.h*m_spd.pitch*2, 0, w);
 571                 memset(p+m_spd.h*m_spd.pitch*3, 0, w);
 572             }
 573         }
 574     }
 575         m_rectListDirty.RemoveAll();
 576     return S_OK;
 577 }
 578
 579 STDMETHODIMP CMemSubPic::Lock(SubPicDesc& spd)
 580 {
 581     return GetDesc(spd);
 582 }
 583
 584 STDMETHODIMP CMemSubPic::Unlock( CAtlList<CRect>* dirtyRectList )
 585 {
 586     int src_type = m_spd.type;
 587     int dst_type = m_alpha_blt_dst_type;
 588     if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
 589                                 dst_type == MSP_RGB24 ||
 590                                 dst_type == MSP_RGB16 ||
 591                                 dst_type == MSP_RGB15))
 592         ||
 593         (src_type==MSP_XY_AUYV &&  dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
 594         ||
 595         (src_type==MSP_AYUV &&  dst_type == MSP_AYUV)
 596         ||
 597         (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
 598                                 dst_type == MSP_YV12 ||
 599                                 dst_type == MSP_P010 ||
 600                                 dst_type == MSP_P016 ||
 601                                 dst_type == MSP_NV12 ||
 602                                 dst_type == MSP_NV21)))
 603     {
 604         return UnlockOther(dirtyRectList);
 605     }
 606     else if(src_type==MSP_RGBA && (dst_type == MSP_YUY2 ||
 607                                    dst_type == MSP_AYUV || //ToDo: fix me MSP_AYUV
 608                                    dst_type == MSP_IYUV ||
 609                                    dst_type == MSP_YV12 ||
 610                                    dst_type == MSP_NV12 ||
 611                                    dst_type == MSP_NV21 ||
 612                                    dst_type == MSP_P010 ||
 613                                    dst_type == MSP_P016))
 614     {
 615         return UnlockRGBA_YUV(dirtyRectList);
 616     }
 617     return E_NOTIMPL;
 618 }
 619
 620 STDMETHODIMP CMemSubPic::UnlockOther(CAtlList<CRect>* dirtyRectList)
 621 {
 622     SetDirtyRectEx(dirtyRectList);
 623     if(m_rectListDirty.IsEmpty()) {
 624         return S_OK;
 625     }
 626
 627     POSITION pos = m_rectListDirty.GetHeadPosition();
 628     while(pos!=NULL)
 629     {
 630         const CRect& cRect = m_rectListDirty.GetNext(pos);
 631         int w = cRect.Width(), h = cRect.Height();
 632         BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*(cRect.top) + cRect.left*4;
 633         BYTE* bottom = top + m_spd.pitch*h;
 634         if(m_alpha_blt_dst_type == MSP_RGB16)
 635         {
 636             for(; top < bottom ; top += m_spd.pitch)
 637             {
 638                 DWORD* s = (DWORD*)top;
 639                 DWORD* e = s + w;
 640                 for(; s < e; s++)
 641                 {
 642                     *s = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
 643                     //                          *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
 644                 }
 645             }
 646         }
 647         else if(m_alpha_blt_dst_type == MSP_RGB15)
 648         {
 649             for(; top < bottom; top += m_spd.pitch)
 650             {
 651                 DWORD* s = (DWORD*)top;
 652                 DWORD* e = s + w;
 653                 for(; s < e; s++)
 654                 {
 655                     *s = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
 656                     //                          *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
 657                 }
 658             }
 659         }
 660         else if(m_alpha_blt_dst_type == MSP_YUY2)
 661         {
 662             XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, m_spd.pitch*(h-1)) );
 663
 664             for(BYTE* tempTop=top; tempTop < bottom ; tempTop += m_spd.pitch)
 665             {
 666                 BYTE* s = tempTop;
 667                 BYTE* e = s + w*4;
 668                 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
 669                 {
 670                     s[4] = (s[0] + s[4])>>1;
 671                     s[0] = (s[2] + s[6])>>1;
 672                 }
 673             }
 674
 675             XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", top, m_spd.pitch*(h-1)) );
 676         }
 677         else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV )
 678         {
 679             //nothing to do
 680         }
 681         else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
 682             || m_alpha_blt_dst_type == MSP_NV12 )
 683         {
 684             SubsampleAndInterlace(cRect, true);
 685         }
 686         else if( m_alpha_blt_dst_type == MSP_NV21 )
 687         {
 688             SubsampleAndInterlace(cRect, false);
 689         }
 690     }
 691     return S_OK;
 692 }
 693
 694 STDMETHODIMP CMemSubPic::UnlockRGBA_YUV(CAtlList<CRect>* dirtyRectList)
 695 {
 696     SetDirtyRectEx(dirtyRectList);
 697     if(m_rectListDirty.IsEmpty()) {
 698         return S_OK;
 699     }
 700
 701     const ColorConvTable *conv_table = ColorConvTable::GetDefaultColorConvTable();
 702     const int *c2y_yb = conv_table->c2y_yb;
 703     const int *c2y_yg = conv_table->c2y_yg;
 704     const int *c2y_yr = conv_table->c2y_yr;
 705     const int cy_cy2 = conv_table->cy_cy2;
 706     const int c2y_cu = conv_table->c2y_cu;
 707     const int c2y_cv = conv_table->c2y_cv;
 708     const int cy_cy = conv_table->cy_cy;
 709     const unsigned char* Clip = conv_table->Clip;
 710
 711     POSITION pos = m_rectListDirty.GetHeadPosition();
 712     while(pos!=NULL)
 713     {
 714         const CRect& cRect = m_rectListDirty.GetNext(pos);
 715         int w = cRect.Width(), h = cRect.Height();
 716
 717         BYTE* top = (BYTE*)m_spd.bits + m_spd.pitch*cRect.top + cRect.left*4;
 718         BYTE* bottom = top + m_spd.pitch*h;
 719
 720         if( m_alpha_blt_dst_type == MSP_YUY2 ||
 721             m_alpha_blt_dst_type == MSP_YV12 ||
 722             m_alpha_blt_dst_type == MSP_IYUV ||
 723             m_alpha_blt_dst_type == MSP_P010 ||
 724             m_alpha_blt_dst_type == MSP_P016 ||
 725             m_alpha_blt_dst_type == MSP_NV12 ||
 726             m_alpha_blt_dst_type == MSP_NV21) {
 727             for(; top < bottom ; top += m_spd.pitch) {
 728                 BYTE* s = top;
 729                 BYTE* e = s + w*4;
 730                 for(; s < e; s+=8) { // ARGB ARGB -> AxYU AxYV
 731                     if((s[3]+s[7]) < 0x1fe) {
 732                         int a = 0x200 - (s[3]+s[7]);
 733                         a <<= 7;
 734                         // 0 <= a <= 0x10000
 735                         s[1] = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a  + 0x8000) >> 16;
 736                         s[5] = (c2y_yb[s[4]] + c2y_yg[s[5]] + c2y_yr[s[6]] + 0x10*a  + 0x8000) >> 16;
 737
 738                         int scaled_y = (s[1]+s[5]-32) * cy_cy2;
 739
 740                         s[0] = Clip[(((((s[0]+s[4])<<15) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
 741                         s[4] = Clip[(((((s[2]+s[6])<<15) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
 742                     } else {
 743                         s[1] = s[5] = 0;
 744                         s[0] = s[4] = 0;
 745                     }
 746                 }
 747             }
 748         }
 749         else if(m_alpha_blt_dst_type == MSP_AYUV) {
 750             for(; top < bottom ; top += m_spd.pitch) {
 751                 BYTE* s = top;
 752                 BYTE* e = s + w*4;
 753                 for(; s < e; s+=4) { // ARGB -> AYUV
 754                     if(s[3] < 0xff) {
 755                         int a = 0x100 - s[3];
 756                         a <<= 8;
 757                         // 0 <= a <= 0x10000
 758
 759                         int y = (c2y_yb[s[0]] + c2y_yg[s[1]] + c2y_yr[s[2]] + 0x10*a + 0x8000) >> 16;
 760                         int scaled_y = (y-32) * cy_cy;
 761                         s[1] = Clip[((((s[0]<<16) - scaled_y) >> 10) * c2y_cu + 0x80*a + 0x8000) >> 16];
 762                         s[0] = Clip[((((s[2]<<16) - scaled_y) >> 10) * c2y_cv + 0x80*a + 0x8000) >> 16];
 763                         s[2] = y;
 764                     } else {
 765                         s[0] = s[1] = 0;
 766                         s[2] = 0;
 767                     }
 768                 }
 769             }
 770         }
 771     }
 772     return S_OK;
 773 }
 774
 775 void CMemSubPic::SubsampleAndInterlace( const CRect& cRect, bool u_first )
 776 {
 777     //fix me: check alignment and log error
 778     int w = cRect.Width(), h = cRect.Height();
 779     BYTE* u_plan = reinterpret_cast<BYTE*>(m_spd.bits) + m_spd.pitch*m_spd.h*2;
 780     BYTE* u_start = u_plan + m_spd.pitch*(cRect.top)+ cRect.left;
 781     BYTE* v_start = u_start + m_spd.pitch*m_spd.h;
 782     BYTE* dst = u_start;
 783     if(!u_first)
 784     {
 785         BYTE* tmp = v_start;
 786         v_start = u_start;
 787         u_start = tmp;
 788     }
 789
 790     //Todo: fix me.
 791     //Walkarround for alignment
 792     if ( (m_spd.pitch&15) == 0 )
 793     {
 794         for (int i=0;i<h;i+=2)
 795         {
 796             subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w, m_spd.pitch);
 797             u_start += 2*m_spd.pitch;
 798             v_start += 2*m_spd.pitch;
 799             dst += m_spd.pitch;
 800         }
 801     }
 802     else
 803     {
 804         for (int i=0;i<h;i+=2)
 805         {
 806             subsample_and_interlace_2_line_c(dst, u_start, v_start, w, m_spd.pitch);
 807             u_start += 2*m_spd.pitch;
 808             v_start += 2*m_spd.pitch;
 809             dst += m_spd.pitch;
 810         }
 811     }
 812 }
 813
 814 STDMETHODIMP CMemSubPic::AlphaBlt( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
 815 {
 816     if(!pSrc || !pDst || !pTarget) {
 817         return E_POINTER;
 818     }
 819     int src_type = m_spd.type;
 820     int dst_type = pTarget->type;
 821
 822     if( (src_type==MSP_RGBA && (dst_type == MSP_RGB32 ||
 823                                 dst_type == MSP_RGB24 ||
 824                                 dst_type == MSP_RGB16 ||
 825                                 dst_type == MSP_RGB15 ||
 826                                 dst_type == MSP_RGBA ||
 827                                 dst_type == MSP_YUY2 ||//ToDo: fix me MSP_RGBA changed into AxYU AxYV after unlock, may be confusing
 828                                 dst_type == MSP_AYUV ))
 829         ||
 830         (src_type==MSP_XY_AUYV &&  dst_type == MSP_YUY2)//ToDo: fix me MSP_AYUV
 831         ||
 832         (src_type==MSP_AYUV &&  dst_type == MSP_AYUV)
 833         ||
 834         (src_type==MSP_AYUV_PLANAR && (dst_type == MSP_IYUV ||
 835                                 dst_type == MSP_YV12)) )
 836     {
 837         return AlphaBltOther(pSrc, pDst, pTarget);
 838     }
 839     else if ( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_NV12 ||
 840                                             dst_type == MSP_NV21 ) )
 841     {
 842         return AlphaBltAnv12_Nvxx(pSrc, pDst, pTarget);
 843     }
 844
 845     else if( src_type==MSP_AYUV_PLANAR && (dst_type == MSP_P010 ||
 846                                            dst_type == MSP_P016 ) )
 847     {
 848         return AlphaBltAnv12_P010(pSrc, pDst, pTarget);
 849     }
 850     else if( src_type==MSP_RGBA && (dst_type == MSP_IYUV ||
 851                                     dst_type == MSP_YV12))
 852     {
 853         return AlphaBltAxyuAxyv_Yv12(pSrc, pDst, pTarget);
 854     }
 855     else if( src_type==MSP_RGBA && (dst_type == MSP_NV12||
 856                                     dst_type == MSP_NV21))
 857     {
 858         return AlphaBltAxyuAxyv_Nv12(pSrc, pDst, pTarget);
 859     }
 860     else if( src_type==MSP_RGBA && (dst_type == MSP_P010 ||
 861                                     dst_type == MSP_P016))
 862     {
 863         return AlphaBltAxyuAxyv_P010(pSrc, pDst, pTarget);
 864     }
 865     return E_NOTIMPL;
 866 }
 867
 868 STDMETHODIMP CMemSubPic::AlphaBltOther(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
 869 {
 870     const SubPicDesc& src = m_spd;
 871     SubPicDesc dst = *pTarget; // copy, because we might modify it
 872
 873     CRect rs(*pSrc), rd(*pDst);
 874     if(dst.h < 0)
 875     {
 876         dst.h = -dst.h;
 877         rd.bottom = dst.h - rd.bottom;
 878         rd.top = dst.h - rd.top;
 879     }
 880         if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
 881         return E_INVALIDARG;
 882         }
 883     int w = rs.Width(), h = rs.Height();
 884     BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);//rs.left*4
 885     BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
 886     if(rd.top > rd.bottom)
 887     {
 888         if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
 889             || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
 890             || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
 891         {
 892             d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
 893         }
 894         else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
 895         {
 896             d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + (rd.left*8>>3);
 897         }
 898         else
 899         {
 900             return E_NOTIMPL;
 901         }
 902         dst.pitch = -dst.pitch;
 903     }
 904     DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
 905     switch(dst.type)
 906     {
 907     case MSP_RGBA:
 908         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 909         {
 910             BYTE* s2 = s;
 911             BYTE* s2end = s2 + w*4;
 912             DWORD* d2 = (DWORD*)d;
 913             for(; s2 < s2end; s2 += 4, d2++)
 914             {
 915                 if(s2[3] < 0xff)
 916                 {
 917                     DWORD bd =0x00000100 -( (DWORD) s2[3]);
 918                     DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
 919                     DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
 920                     DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
 921                     *d2 = B | V | R
 922                         | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
 923                 }
 924             }
 925         }
 926         break;
 927     case MSP_RGB32:
 928     case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
 929         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 930         {
 931             BYTE* s2 = s;
 932             BYTE* s2end = s2 + w*4;
 933             DWORD* d2 = (DWORD*)d;
 934             for(; s2 < s2end; s2 += 4, d2++)
 935             {
 936 #ifdef _WIN64
 937                                                         DWORD ia = 256-s2[3];
 938                                                         if(s2[3] < 0xff) {
 939                                                                 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
 940                                                                           | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
 941                                                         }
 942 #else
 943                 if(s2[3] < 0xff)
 944                 {
 945                     *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
 946                         | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
 947                 }
 948 #endif
 949             }
 950         }
 951         break;
 952     case MSP_RGB24:
 953         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 954         {
 955             BYTE* s2 = s;
 956             BYTE* s2end = s2 + w*4;
 957             BYTE* d2 = d;
 958             for(; s2 < s2end; s2 += 4, d2 += 3)
 959             {
 960                 if(s2[3] < 0xff)
 961                 {
 962                     d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
 963                     d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
 964                     d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
 965                 }
 966             }
 967         }
 968         break;
 969     case MSP_RGB16:
 970         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 971         {
 972             BYTE* s2 = s;
 973             BYTE* s2end = s2 + w*4;
 974             WORD* d2 = (WORD*)d;
 975             for(; s2 < s2end; s2 += 4, d2++)
 976             {
 977                 if(s2[3] < 0x1f)
 978                 {
 979                     *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
 980                         | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
 981                     /*                                  *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
 982                     | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
 983                     | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
 984                     */
 985                 }
 986             }
 987         }
 988         break;
 989     case MSP_RGB15:
 990         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
 991         {
 992             BYTE* s2 = s;
 993             BYTE* s2end = s2 + w*4;
 994             WORD* d2 = (WORD*)d;
 995             for(; s2 < s2end; s2 += 4, d2++)
 996             {
 997                 if(s2[3] < 0x1f)
 998                 {
 999                     *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
1000                         | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
1001                     /*                                  *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
1002                     | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
1003                     | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
1004                     */
1005                 }
1006             }
1007         }
1008         break;
1009     case MSP_YUY2:
1010         for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
1011         {
1012             unsigned int ia, c;
1013             BYTE* s2 = s;
1014             BYTE* s2end = s2 + w*4;
1015             DWORD* d2 = (DWORD*)d;
1016             for(; s2 < s2end; s2 += 8, d2++)
1017             {
1018                 ia = (s2[3]+s2[7])>>1;
1019                 if(ia < 0xff)
1020                 {
1021                     //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
1022                     //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
1023                     //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
1024                     //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
1025                     //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
1026
1027                     ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
1028                     c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
1029                     __asm
1030                     {
1031                             mov                 edi, d2
1032                             pxor                mm0, mm0
1033                             movd                mm2, c
1034                             punpcklbw   mm2, mm0
1035                             movd                mm3, [edi]
1036                             punpcklbw   mm3, mm0
1037                             movd                mm4, ia
1038                             punpcklbw   mm4, mm0
1039                             psraw               mm4, 1          //or else, overflow because psraw shift in sign bit
1040                             pmullw              mm3, mm4
1041                             psraw               mm3, 7
1042                             paddsw              mm3, mm2
1043                             packuswb    mm3, mm3
1044                             movd                [edi], mm3
1045                     };
1046                 }
1047             }
1048         }
1049         __asm emms;
1050         break;
1051     case MSP_YV12:
1052     case MSP_IYUV:
1053         {
1054             //dst.pitch = abs(dst.pitch);
1055             int h2 = h/2;
1056             if(!dst.pitchUV)
1057             {
1058                 dst.pitchUV = abs(dst.pitch)/2;
1059             }
1060             if(!dst.bitsU || !dst.bitsV)
1061             {
1062                 dst.bitsU = (BYTE*)dst.bits + abs(dst.pitch)*dst.h;
1063                 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1064                 if(dst.type == MSP_YV12)
1065                 {
1066                     BYTE* p = dst.bitsU;
1067                     dst.bitsU = dst.bitsV;
1068                     dst.bitsV = p;
1069                 }
1070             }
1071             BYTE* dd[2];
1072             dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1073             dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1074             if(rd.top > rd.bottom)
1075             {
1076                 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1077                 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1078                 dst.pitchUV = -dst.pitchUV;
1079             }
1080
1081             BYTE* src_origin= (BYTE*)src.bits + src.pitch*rs.top + rs.left;
1082
1083             BYTE* ss[2];
1084             ss[0] = src_origin + src.pitch*src.h*2;//U
1085             ss[1] = src_origin + src.pitch*src.h*3;//V
1086
1087             AlphaBltYv12Luma( d, dst.pitch, w, h, src_origin + src.pitch*src.h, src_origin, src.pitch );
1088
1089             AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, ss[0], src_origin, src.pitch);
1090             AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, ss[1], src_origin, src.pitch);
1091
1092             __asm emms;
1093         }
1094         break;
1095     default:
1096         return E_NOTIMPL;
1097         break;
1098     }
1099
1100     //emmsÒª40¸öcpuÖÜÆÚ
1101     //__asm emms;
1102     return S_OK;
1103 }
1104
1105 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_P010(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1106 {
1107     const SubPicDesc& src = m_spd;
1108     SubPicDesc dst = *pTarget; // copy, because we might modify it
1109
1110     CRect rs(*pSrc), rd(*pDst);
1111
1112     if(dst.h < 0) {
1113         dst.h = -dst.h;
1114         rd.bottom = dst.h - rd.bottom;
1115         rd.top = dst.h - rd.top;
1116     }
1117
1118     if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1119         return E_INVALIDARG;
1120     }
1121
1122     int w = rs.Width(), h = rs.Height();
1123
1124     //Y
1125     BYTE* s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1126     BYTE* d = static_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1127
1128     if(rd.top > rd.bottom) {
1129         d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1130
1131         dst.pitch = -dst.pitch;
1132     }
1133
1134     for(ptrdiff_t i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1135     {
1136         BYTE* s2 = s;
1137         BYTE* s2end = s2 + w*4;
1138         WORD* d2 = reinterpret_cast<WORD*>(d);
1139         for(; s2 < s2end; s2 += 4, d2++)
1140         {
1141             if(s2[3] < 0xff) {
1142                 d2[0] = ((d2[0]*s2[3])>>8) + (s2[1]<<8);
1143             }
1144         }
1145     }
1146
1147     //UV
1148     int h2 = h/2;
1149     if(!dst.pitchUV)
1150     {
1151         dst.pitchUV = abs(dst.pitch);
1152     }
1153     if(!dst.bitsU || !dst.bitsV)
1154     {
1155         dst.bitsU = static_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1156         dst.bitsV = dst.bitsU + 2;
1157     }
1158     BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1159     if(rd.top > rd.bottom)
1160     {
1161         ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1162         dst.pitchUV = -dst.pitchUV;
1163     }
1164
1165     s = static_cast<BYTE*>(src.bits) + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1166
1167     d = ddUV;
1168     int pitch = src.pitch;
1169     for(int j = 0; j < h2; j++, s += 2*src.pitch, d += dst.pitchUV )
1170     {
1171         BYTE* s2 = s;
1172         WORD* d2=reinterpret_cast<WORD*>(d);
1173         WORD* d2_end = reinterpret_cast<WORD*>(d+2*w);
1174         for( ; d2<d2_end; s2+=8, d2+=2)
1175         {
1176             unsigned int ia = (
1177                 s2[3]+          s2[3+4]+
1178                 s2[3+src.pitch]+s2[3+4+src.pitch]);
1179             if( ia!=0xFF*4 )
1180             {
1181                 d2[0] = (((d2[0])*ia)>>10) + ((s2[0] + s2[0+src.pitch])<<7);
1182                 d2[1] = (((d2[1])*ia)>>10) + ((s2[4] + s2[4+src.pitch])<<7);
1183             }
1184         }
1185     }
1186
1187     return S_OK;
1188 }
1189
1190 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Yv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1191 {
1192     const SubPicDesc& src = m_spd;
1193     SubPicDesc dst = *pTarget; // copy, because we might modify it
1194
1195     CRect rs(*pSrc), rd(*pDst);
1196
1197     if(dst.h < 0) {
1198         dst.h = -dst.h;
1199         rd.bottom = dst.h - rd.bottom;
1200         rd.top = dst.h - rd.top;
1201     }
1202
1203     if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1204         return E_INVALIDARG;
1205     }
1206
1207     int w = rs.Width(), h = rs.Height();
1208
1209     BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1210     BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1211
1212     if(rd.top > rd.bottom) {
1213         d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1214
1215         dst.pitch = -dst.pitch;
1216     }
1217
1218     for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1219         BYTE* s2 = s;
1220         BYTE* s2end = s2 + w*4;
1221         BYTE* d2 = d;
1222         for(; s2 < s2end; s2 += 4, d2++) {
1223             if(s2[3] < 0xff) {
1224                 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1225             }
1226         }
1227     }
1228     dst.pitch = abs(dst.pitch);
1229
1230     int h2 = h/2;
1231
1232     if(!dst.pitchUV) {
1233         dst.pitchUV = dst.pitch/2;
1234     }
1235
1236     BYTE* ss[2];
1237     ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1238     ss[1] = ss[0] + 4;
1239
1240     if(!dst.bitsU || !dst.bitsV) {
1241         dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1242         dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
1243
1244         if(dst.type == MSP_YV12) {
1245             BYTE* p = dst.bitsU;
1246             dst.bitsU = dst.bitsV;
1247             dst.bitsV = p;
1248         }
1249     }
1250
1251     BYTE* dd[2];
1252     dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
1253     dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
1254
1255     if(rd.top > rd.bottom) {
1256         dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1257         dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
1258         dst.pitchUV = -dst.pitchUV;
1259     }
1260
1261     for(ptrdiff_t i = 0; i < 2; i++) {
1262         s = ss[i];
1263         d = dd[i];
1264         BYTE* is = ss[1-i];
1265         for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1266             BYTE* s2 = s;
1267             BYTE* s2end = s2 + w*4;
1268             BYTE* d2 = d;
1269             BYTE* is2 = is;
1270             for(; s2 < s2end; s2 += 8, d2++, is2 += 8) {
1271                 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1272                 if(ia < 0xff) {
1273                     *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1274                 }
1275             }
1276         }
1277     }
1278
1279     return S_OK;
1280 }
1281
1282 STDMETHODIMP CMemSubPic::AlphaBltAxyuAxyv_Nv12(const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget)
1283 {
1284     const SubPicDesc& src = m_spd;
1285     SubPicDesc dst = *pTarget; // copy, because we might modify it
1286
1287     CRect rs(*pSrc), rd(*pDst);
1288
1289     if(dst.h < 0) {
1290         dst.h = -dst.h;
1291         rd.bottom = dst.h - rd.bottom;
1292         rd.top = dst.h - rd.top;
1293     }
1294
1295     if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1296         return E_INVALIDARG;
1297     }
1298
1299     int w = rs.Width(), h = rs.Height();
1300
1301     BYTE* s = (BYTE*)src.bits + src.pitch*rs.top + ((rs.left*src.bpp)>>3);
1302     BYTE* d = (BYTE*)dst.bits + dst.pitch*rd.top + rd.left;
1303
1304     if(rd.top > rd.bottom) {
1305         d = (BYTE*)dst.bits + dst.pitch*(rd.top-1) + rd.left;
1306
1307         dst.pitch = -dst.pitch;
1308     }
1309
1310     for(ptrdiff_t j = 0; j < h; j++, s += src.pitch, d += dst.pitch) {
1311         BYTE* s2 = s;
1312         BYTE* s2end = s2 + w*4;
1313         BYTE* d2 = d;
1314         for(; s2 < s2end; s2 += 4, d2++) {
1315             if(s2[3] < 0xff) {
1316                 d2[0] = ((d2[0]*s2[3])>>8) + s2[1];
1317             }
1318         }
1319     }
1320     dst.pitch = abs(dst.pitch);
1321
1322     int h2 = h/2;
1323
1324     if(!dst.pitchUV) {
1325         dst.pitchUV = dst.pitch;
1326     }
1327
1328     BYTE* ss[2];
1329     ss[0] = (BYTE*)src.bits + src.pitch*rs.top + rs.left*4;
1330     ss[1] = ss[0] + 4;
1331
1332     if(!dst.bitsU || !dst.bitsV) {
1333         dst.bitsU = (BYTE*)dst.bits + dst.pitch*dst.h;
1334         dst.bitsV = dst.bitsU + 1;
1335
1336         if(dst.type == MSP_NV21) {
1337             BYTE* p = dst.bitsU;
1338             dst.bitsU = dst.bitsV;
1339             dst.bitsV = p;
1340         }
1341     }
1342
1343     BYTE* dd[2];
1344     dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1345     dd[1] = dd[0]+1;
1346
1347     if(rd.top > rd.bottom) {
1348         dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1349         dd[1] = dd[0]+1;
1350         dst.pitchUV = -dst.pitchUV;
1351     }
1352
1353     for(ptrdiff_t i = 0; i < 2; i++) {
1354         s = ss[i];
1355         d = dd[i];
1356         BYTE* is = ss[1-i];
1357         for(ptrdiff_t j = 0; j < h2; j++, s += src.pitch*2, d += dst.pitchUV, is += src.pitch*2) {
1358             BYTE* s2 = s;
1359             BYTE* s2end = s2 + w*4;
1360             BYTE* d2 = d;
1361             BYTE* is2 = is;
1362             for(; s2 < s2end; s2 += 8, d2+=2, is2 += 8) {
1363                 unsigned int ia = (s2[3]+s2[3+src.pitch]+is2[3]+is2[3+src.pitch])>>2;
1364                 if(ia < 0xff) {
1365                     *d2 = ((*d2*ia)>>8) + ((s2[0]+s2[src.pitch])>>1);
1366                 }
1367             }
1368         }
1369     }
1370
1371     return S_OK;
1372 }
1373
1374 STDMETHODIMP CMemSubPic::AlphaBltAnv12_P010( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1375 {
1376     //fix me: check colorspace and log error
1377     const SubPicDesc& src = m_spd;
1378     SubPicDesc dst = *pTarget; // copy, because we might modify it
1379
1380     CRect rs(*pSrc), rd(*pDst);
1381     if(dst.h < 0)
1382     {
1383         dst.h = -dst.h;
1384         rd.bottom = dst.h - rd.bottom;
1385         rd.top = dst.h - rd.top;
1386     }
1387     if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1388         return E_INVALIDARG;
1389     }
1390     int w = rs.Width(), h = rs.Height();
1391     bool bottom_down = rd.top > rd.bottom;
1392
1393     BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
1394     if(bottom_down)
1395     {
1396         d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
1397         dst.pitch = -dst.pitch;
1398     }
1399
1400     //dst.pitch = abs(dst.pitch);
1401     int h2 = h/2;
1402     if(!dst.pitchUV)
1403     {
1404         dst.pitchUV = abs(dst.pitch);
1405     }
1406     dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1407     BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left*2;
1408     if(bottom_down)
1409     {
1410         ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left*2;
1411         dst.pitchUV = -dst.pitchUV;
1412     }
1413
1414     BYTE* src_origin= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1415     BYTE *s = src_origin;
1416
1417     // equivalent:
1418     //   if( (reinterpret_cast<intptr_t>(s2)&15)==0 && (reinterpret_cast<intptr_t>(sa)&15)==0
1419     //     && (reinterpret_cast<intptr_t>(d2)&15)==0 )
1420     if( ((reinterpret_cast<intptr_t>(s) | static_cast<intptr_t>(src.pitch) |
1421         reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1422     {
1423         for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1424         {
1425             BYTE* sa = s;
1426             BYTE* s2 = s + src.pitch*src.h;
1427             BYTE* s2end_mod16 = s2 + (w&~15);
1428             BYTE* s2end = s2 + w;
1429             BYTE* d2 = d;
1430
1431             for(; s2 < s2end_mod16; s2+=16, sa+=16, d2+=32)
1432             {
1433                 mix_16_y_p010_sse2(d2, s2, sa);
1434             }
1435             for( WORD* d3=reinterpret_cast<WORD*>(d2); s2 < s2end; s2++, sa++, d3++)
1436             {
1437                 if(sa[0] < 0xff)
1438                 {
1439                     d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1440                 }
1441             }
1442         }
1443     }
1444     else //fix me: only a workaround for non-mod-16 size video
1445     {
1446         for(int i=0; i<h; i++, s += src.pitch, d += dst.pitch)
1447         {
1448             BYTE* sa = s;
1449             BYTE* s2 = s + src.pitch*src.h;
1450             BYTE* s2end_mod16 = s2 + (w&~15);
1451             BYTE* s2end = s2 + w;
1452             WORD* d2 = reinterpret_cast<WORD*>(d);
1453             for(; s2 < s2end; s2+=1, sa+=1, d2+=1)
1454             {
1455                 if(sa[0] < 0xff)
1456                 {
1457                     d2[0] = ((d2[0]*sa[0])>>8) + (s2[0]<<8);
1458                 }
1459             }
1460         }
1461     }
1462
1463     d = ddUV;
1464     BYTE* sa = src_origin;
1465     BYTE* s_uv = src_origin + src.pitch*src.h*2;//UV
1466     if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1467         reinterpret_cast<intptr_t>(d) | static_cast<intptr_t>(dst.pitch) ) & 15 )==0 )
1468     {
1469         for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1470         {
1471             BYTE* s_u2 = s_uv;
1472             BYTE* sa2 = sa;
1473             BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1474             BYTE* s_u2end = s_u2 + w;
1475             BYTE* d2 = d;
1476
1477             for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=32)
1478             {
1479                 mix_16_uv_p010_sse2(d2, s_u2, sa2, src.pitch);
1480             }
1481
1482             for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1483             {
1484                 unsigned int ia = (
1485                     sa2[0]+          sa2[1]+
1486                     sa2[0+src.pitch]+sa2[1+src.pitch]);
1487                 if( ia!=0xFF*4 )
1488                 {
1489                     d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1490                     d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1491                 }
1492             }
1493         }
1494     }
1495     else
1496     {
1497         for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1498         {
1499             BYTE* s_u2 = s_uv;
1500             BYTE* sa2 = sa;
1501             BYTE* s_u2end = s_u2 + w;
1502             BYTE* d2 = d;
1503
1504             for( WORD* d3=reinterpret_cast<WORD*>(d2); s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1505             {
1506                 unsigned int ia = (
1507                     sa2[0]+          sa2[1]+
1508                     sa2[0+src.pitch]+sa2[1+src.pitch]);
1509                 if( ia!=0xFF*4 )
1510                 {
1511                     d3[0] = (((d3[0])*ia)>>10) + (s_u2[0]<<8);
1512                     d3[1] = (((d3[1])*ia)>>10) + (s_u2[1]<<8);
1513                 }
1514             }
1515         }
1516     }
1517     __asm emms;
1518 }
1519
1520 STDMETHODIMP CMemSubPic::AlphaBltAnv12_Nvxx( const RECT* pSrc, const RECT* pDst, SubPicDesc* pTarget )
1521 {
1522     //fix me: check colorspace and log error
1523     const SubPicDesc& src = m_spd;
1524     SubPicDesc dst = *pTarget; // copy, because we might modify it
1525
1526     CRect rs(*pSrc), rd(*pDst);
1527     if(dst.h < 0)
1528     {
1529         dst.h = -dst.h;
1530         rd.bottom = dst.h - rd.bottom;
1531         rd.top = dst.h - rd.top;
1532     }
1533     if(rs.Width() != rd.Width() || rs.Height() != abs(rd.Height())) {
1534         return E_INVALIDARG;
1535     }
1536     int w = rs.Width(), h = rs.Height();
1537     bool bottom_down = rd.top > rd.bottom;
1538
1539     BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
1540     if(bottom_down)
1541     {
1542         d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
1543         dst.pitch = -dst.pitch;
1544     }
1545
1546     //dst.pitch = abs(dst.pitch);
1547     int h2 = h/2;
1548     if(!dst.pitchUV)
1549     {
1550         dst.pitchUV = abs(dst.pitch);
1551     }
1552     if(!dst.bitsU)
1553     {
1554         dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
1555     }
1556     BYTE* ddUV = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left;
1557     if(bottom_down)
1558     {
1559         ddUV = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left;
1560         dst.pitchUV = -dst.pitchUV;
1561     }
1562
1563     BYTE* sa= reinterpret_cast<BYTE*>(src.bits) + src.pitch*rs.top + rs.left;
1564
1565     BYTE* s_uv = sa + src.pitch*src.h*2;//UV
1566
1567     AlphaBltYv12Luma( d, dst.pitch, w, h, sa + src.pitch*src.h, sa, src.pitch );
1568     if( ((reinterpret_cast<intptr_t>(sa) | static_cast<intptr_t>(src.pitch) |
1569         reinterpret_cast<intptr_t>(ddUV) | static_cast<intptr_t>(dst.pitchUV) ) & 15 )==0 )
1570     {
1571         BYTE* d = ddUV;
1572         int pitch = src.pitch;
1573         for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1574         {
1575             BYTE* s_u2 = s_uv;
1576             BYTE* sa2 = sa;
1577             BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1578             BYTE* s_u2end = s_u2 + w;
1579             BYTE* d2 = d;
1580
1581             for(; s_u2 < s_u2end_mod16; s_u2+=16, sa2+=16, d2+=16)
1582             {
1583                 mix_16_uv_nvxx_sse2(d2, s_u2, sa2, src.pitch);
1584             }
1585             for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1586             {
1587                 unsigned int ia = (
1588                     sa2[0]+          sa2[1]+
1589                     sa2[0+src.pitch]+sa2[1+src.pitch]);
1590                 if( ia!=0xFF*4 )
1591                 {
1592                     d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1593                     d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1594                 }
1595             }
1596         }
1597     }
1598     else
1599     {
1600         BYTE* d = ddUV;
1601         int pitch = src.pitch;
1602         for(int j = 0; j < h2; j++, s_uv += src.pitch, sa += src.pitch*2, d += dst.pitchUV)
1603         {
1604             BYTE* s_u2 = s_uv;
1605             BYTE* sa2 = sa;
1606             BYTE* s_u2end_mod16 = s_u2 + (w&~15);
1607             BYTE* s_u2end = s_u2 + w;
1608             BYTE* d2 = d;
1609
1610             for( BYTE* d3=d2; s_u2 < s_u2end; s_u2+=2, sa2+=2, d3+=2)
1611             {
1612                 unsigned int ia = (
1613                     sa2[0]+          sa2[1]+
1614                     sa2[0+src.pitch]+sa2[1+src.pitch]);
1615                 if( ia!=0xFF*4 )
1616                 {
1617                     d3[0] = (((d3[0])*ia)>>10) + s_u2[0];
1618                     d3[1] = (((d3[1])*ia)>>10) + s_u2[1];
1619                 }
1620             }
1621         }
1622     }
1623
1624     __asm emms;
1625 }
1626
1627 STDMETHODIMP CMemSubPic::SetDirtyRectEx(CAtlList<CRect>* dirtyRectList )
1628 {
1629     //if(m_spd.type == MSP_YUY2 || m_spd.type == MSP_YV12 || m_spd.type == MSP_IYUV || m_spd.type == MSP_AYUV)
1630     if(dirtyRectList!=NULL)
1631     {
1632         POSITION pos = dirtyRectList->GetHeadPosition();
1633         if(m_spd.type == MSP_AYUV_PLANAR || m_alpha_blt_dst_type==MSP_IYUV || m_alpha_blt_dst_type==MSP_YV12
1634             || m_alpha_blt_dst_type==MSP_P010 || m_alpha_blt_dst_type==MSP_P016
1635             || m_alpha_blt_dst_type==MSP_NV12 || m_alpha_blt_dst_type==MSP_NV21 )
1636         {
1637             while(pos!=NULL)
1638             {
1639                 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1640                 cRectSrc.left &= ~15;
1641                 cRectSrc.right = (cRectSrc.right+15)&~15;
1642                 if(cRectSrc.right>m_spd.w)
1643                 {
1644                     cRectSrc.right = m_spd.w;
1645                 }
1646                 cRectSrc.top &= ~1;
1647                 cRectSrc.bottom = (cRectSrc.bottom+1)&~1;
1648             }
1649         }
1650         else if(m_spd.type == MSP_XY_AUYV || m_alpha_blt_dst_type==MSP_YUY2)
1651         {
1652             while(pos!=NULL)
1653             {
1654                 CRect& cRectSrc = dirtyRectList->GetNext(pos);
1655                 cRectSrc.left &= ~3;
1656                 cRectSrc.right = (cRectSrc.right+3)&~3;
1657             }
1658         }
1659     }
1660     return __super::SetDirtyRectEx(dirtyRectList);
1661 }
1662
1663 //
1664 // CMemSubPicAllocator
1665 //
1666
1667 CMemSubPicAllocator::CMemSubPicAllocator(int alpha_blt_dst_type, SIZE maxsize, int type/*=-1*/)
1668     : CSubPicExAllocatorImpl(maxsize, false, false)
1669     , m_alpha_blt_dst_type(alpha_blt_dst_type)
1670     , m_maxsize(maxsize)
1671     , m_type(type)
1672 {
1673     if(m_type==-1)
1674     {
1675         switch(alpha_blt_dst_type)
1676         {
1677         case MSP_YUY2:
1678             m_type = MSP_XY_AUYV;
1679             break;
1680         case MSP_AYUV:
1681             m_type = MSP_AYUV;
1682             break;
1683         case MSP_IYUV:
1684         case MSP_YV12:
1685         case MSP_P010:
1686         case MSP_P016:
1687         case MSP_NV12:
1688         case MSP_NV21:
1689             m_type = MSP_AYUV_PLANAR;
1690             break;
1691         default:
1692             m_type = MSP_RGBA;
1693             break;
1694         }
1695     }
1696 }
1697
1698 // ISubPicAllocatorImpl
1699
1700 bool CMemSubPicAllocator::AllocEx(bool fStatic, ISubPicEx** ppSubPic)
1701 {
1702         if(!ppSubPic) {
1703                 return false;
1704         }
1705     SubPicDesc spd;
1706     spd.w = m_maxsize.cx;
1707     spd.h = m_maxsize.cy;
1708     spd.bpp = 32;
1709     spd.pitch = (spd.w*spd.bpp)>>3;
1710     spd.type = m_type;
1711         spd.bits = DNew BYTE[spd.pitch*spd.h];
1712         if(!spd.bits) {
1713                 return false;
1714         }
1715         *ppSubPic = DNew CMemSubPic(spd, m_alpha_blt_dst_type);
1716         if(!(*ppSubPic)) {
1717                 return false;
1718         }
1719     (*ppSubPic)->AddRef();
1720         return true;
1721 }
1722
1723
1724