From b06b377437d001e9c23b1ffd7d9c3118a80e018a Mon Sep 17 00:00:00 2001 From: xy Date: Mon, 9 Jul 2012 21:54:18 +0800 Subject: [PATCH] Deal with head/tail non-aligned part (For P010/P016 Chroma) --- src/subpic/MemSubPic.cpp | 19 ++++++++--- src/subpic/xy_intrinsics.h | 74 +++++++++++++++++++++++++++++++++------- test/unit_test/test_alphablend.h | 14 +++++--- 3 files changed, 86 insertions(+), 21 deletions(-) diff --git a/src/subpic/MemSubPic.cpp b/src/subpic/MemSubPic.cpp index 9b4d9da..5314010 100644 --- a/src/subpic/MemSubPic.cpp +++ b/src/subpic/MemSubPic.cpp @@ -1295,13 +1295,24 @@ HRESULT CMemSubPic::AlphaBltAnv12_P010( const BYTE* src_a, const BYTE* src_y, co } //UV int h2 = h/2; - BYTE* d = dst_uv; - if( ((reinterpret_cast(src_a) | reinterpret_cast(src_uv) | static_cast(src_pitch) | - reinterpret_cast(dst_uv) | static_cast(dst_pitch) ) & 15 )==0 ) + BYTE* d = dst_uv; + if( ( + ((reinterpret_cast(src_a) ^ reinterpret_cast(src_uv)) + |(reinterpret_cast(src_a) ^ reinterpret_cast(dst_uv)) + | static_cast(src_pitch) + | static_cast(dst_pitch) ) & 15) ==0 && + w > 16 ) { + int head = (16-(reinterpret_cast(src_a)&15))&15; + int tail = (w-head) & 15; + int w00 = w - head - tail; + + ASSERT(w>0);//the calls to mix may failed if w==0 for(int j = 0; j < h2; j++, src_uv += src_pitch, src_a += src_pitch*2, d += dst_pitch) { - hleft_vmid_mix_uv_p010_sse2(d, w, src_uv, src_a, src_pitch); + hleft_vmid_mix_uv_p010_c2(d, head, src_uv, src_a, src_pitch); + hleft_vmid_mix_uv_p010_sse2(d+2*head, w00, src_uv+head, src_a+head, src_pitch, head>0 ? -1 : 0); + hleft_vmid_mix_uv_p010_c2(d+2*(head+w00), tail, src_uv+head+w00, src_a+head+w00, src_pitch, (w00+head)>0 ? -1 : 0); } } else diff --git a/src/subpic/xy_intrinsics.h b/src/subpic/xy_intrinsics.h index f542f3d..2ad3dbe 100644 --- a/src/subpic/xy_intrinsics.h +++ b/src/subpic/xy_intrinsics.h @@ -639,10 +639,61 @@ static __forceinline void hleft_vmid_mix_uv_p010_c(BYTE* dst, int w, const BYTE* } } -static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0) +//0<=w15<=15 +static __forceinline void hleft_vmid_mix_uv_p010_c2(BYTE* dst, int w15, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0) { + ASSERT(w15>=0 && w15<=15 && (w15&1)==0 ); + int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2; + WORD* dst_word = reinterpret_cast(dst); + +#ifdef XY_UNIT_TEST +# define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp) tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);/*if(tmp>0xffff) tmp = 0xffff;*/ +#else +# define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp) +#endif + + switch(w15) + { + case 14: +#define _hleft_vmid_mix_uv_p010_c2_mix_2 \ + int ia = (am[0]+am[0+src_pitch]+1)/2;\ + int tmp2 = (am[1]+am[1+src_pitch]+1)/2;\ + last_alpha = (last_alpha + tmp2 + 1)/2;\ + ia = (ia + last_alpha + 1)/2;\ + last_alpha = tmp2;\ + \ + if( ia!=0xFF )\ + {\ + int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);\ + _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\ + dst_word[0] = tmp;\ + tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);\ + _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\ + dst_word[1] = tmp;\ + } src+=2, am+=2, dst_word+=2 + + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 12: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 10: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 8: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 6: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 4: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + case 2: + { _hleft_vmid_mix_uv_p010_c2_mix_2; } + } +} + +// am[last_src_id] valid && w&15=0 +static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0) +{ + ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 ); __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 ); - const BYTE* end_mod16 = src + (w&~15); + const BYTE* end_mod16 = src + (w00&~15); for(; src < end_mod16; src+=16, am+=16, dst+=32) { //important! @@ -704,7 +755,6 @@ static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w, const BY dst_y = _mm_adds_epu16(dst_y, lo); _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y ); } - hleft_vmid_mix_uv_p010_c(dst, w&15, src, am, src_pitch, w>15?-1:0); } static __forceinline void hleft_vmid_mix_uv_nv12_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0) @@ -746,28 +796,28 @@ static __forceinline void hleft_vmid_mix_uv_nv12_c2(BYTE* dst, int w15, const BY dst[0] = (((dst[0])*ia)>>8) + src[0];\ dst[1] = (((dst[1])*ia)>>8) + src[1];\ }\ - src+=2, am+=2, dst+=2; + src+=2, am+=2, dst+=2 - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 12: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 10: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 8: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 6: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 4: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } case 2: - { _hleft_vmid_mix_uv_nv12_c2_mix_2 } + { _hleft_vmid_mix_uv_nv12_c2_mix_2; } } } // am[last_src_id] valid && w&15=0 static __forceinline void hleft_vmid_mix_uv_nv12_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0) { - ASSERT( ((int)dst | w00 | (int)src | (int)am | src_pitch)&15==0 ); + ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 ); __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 ); const BYTE* end_mod16 = src + w00; for(; src < end_mod16; src+=16, am+=16, dst+=16) diff --git a/test/unit_test/test_alphablend.h b/test/unit_test/test_alphablend.h index 2dbd661..2b73f0d 100644 --- a/test/unit_test/test_alphablend.h +++ b/test/unit_test/test_alphablend.h @@ -259,15 +259,17 @@ TEST_F(AlphaBlendTest, Check_hleft_vmid_mix_uv_p010) data2 = data1; hleft_vmid_mix_uv_p010_c( data1.dst, w, data1.src, data1.alpha, pitch); - hleft_vmid_mix_uv_p010_sse2( data2.dst, w, data2.src, data2.alpha, pitch); + + hleft_vmid_mix_uv_p010_sse2( data2.dst, w&~15, data2.src, data2.alpha, pitch); + hleft_vmid_mix_uv_p010_c2( data2.dst+(w&~15), w&15, data2.src+(w&~15), data2.alpha+(w&~15), pitch, (w&~15)>0?-1:0); ASSERT_EQ(true, data1==data0) - <<"pitch "<0?-1:0); ASSERT_EQ(true, data1==data2) - <<"pitch "<