From f06bab05b160529a5a7a386bf5fd2c5d6572f828 Mon Sep 17 00:00:00 2001 From: xy Date: Sat, 28 Jul 2012 18:47:34 +0800 Subject: [PATCH] Blur Optimization. [Part 6] --- src/subtitles/xy_filter.cpp | 173 +++++++++++++--- test/unit_test/unit_test.vcxproj | 4 +- test/unit_test/xy_filter_benchmark.h | 370 +++++++++++++++++++++++++++++++---- 3 files changed, 482 insertions(+), 65 deletions(-) diff --git a/src/subtitles/xy_filter.cpp b/src/subtitles/xy_filter.cpp index 9dee499..f7f0849 100644 --- a/src/subtitles/xy_filter.cpp +++ b/src/subtitles/xy_filter.cpp @@ -2,6 +2,31 @@ typedef const UINT8 CUINT8, *PCUINT8; +// +// ref: "Comparing floating point numbers" by Bruce Dawson +// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm +// +bool AlmostEqual(float A, float B, int maxUlps=0) +{ + // Make sure maxUlps is non-negative and small enough that the + // default NAN won't compare as equal to anything. + ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024); + int aInt = *(int*)&A; + // Make aInt lexicographically ordered as a twos-complement int + if (aInt < 0) + aInt = 0x80000000 - aInt; + // Make bInt lexicographically ordered as a twos-complement int + int bInt = *(int*)&B; + if (bInt < 0) + bInt = 0x80000000 - bInt; + + int intDiff = abs(aInt - bInt); + if (intDiff <= maxUlps) + return true; + + return false; +} + /**** * See @xy_filter_c **/ @@ -112,6 +137,15 @@ void xy_filter_c(float *dst, int width, int height, int stride, const float *fil f4_1 = _mm_mul_ps(f4_1, src_3_6);\ sum = _mm_add_ps(sum, f4_1) +/**** + * @src4, @f4_1, @sum : __m128 + * @f4_1: const + * @sum : output + * @src4: undefined + **/ +#define XY_FILTER_4_1(src4, f4_1, sum) \ + src4 = _mm_mul_ps(src4, f4_1);\ + sum = _mm_add_ps(sum, src4); __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width) { @@ -173,7 +207,7 @@ __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float ASSERT(filter_start == filter); for 
(;dst2 struct M128s { @@ -295,6 +333,21 @@ struct M128s }; template<> +struct M128s<1> +{ + __m128 x; + + template __forceinline __m128& GetAt() + { + return x; + } + __forceinline void Load(const float* src) + { + x = _mm_set1_ps(*src); + } +}; + +template<> struct M128s<0> { void Load(const float* src) @@ -307,14 +360,33 @@ struct Filter4 { static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s& filter128s, __m128& sum) { - Filter4::do_cal(src0_128, src4, filter128s, sum); - __m128 src4_128 = _mm_load_ps(src4+LENGTH-4); - XY_FILTER_4(src0_128, src4_128, filter128s.GetAt(), sum); + __m128 src4_128 = _mm_load_ps(src4); + { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt(), sum); } + Filter4::do_cal(src4_128, src4+4, filter128s, sum); src0_128 = src4_128; } }; template +struct Filter4 +{ + static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s& filter128s, __m128& sum) + { + cal_tail(src0_128, src4, filter128s, sum); + } + template + static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s& filter128s, __m128& sum) + { + + } + template<> + static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s& filter128s, __m128& sum) + { + { XY_FILTER_4_1(src0_128, filter128s.GetAt(), sum); } + } +}; + +template struct Filter4 { static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s& filter128s, __m128& sum) @@ -323,19 +395,42 @@ struct Filter4 }; template -struct FilterAllMargin +struct FilterAllLeftMargin { - static __forceinline void cal_left_margin(float * src, M128s& filter128s) + static __forceinline void cal(float * src, M128s& filter128s) + { + do_cal(src, filter128s); + } + template + static __forceinline void do_cal(float * src, M128s& filter128s) { //filter 4 __m128 src0 = _mm_setzero_ps(); __m128 sum = _mm_setzero_ps(); Filter4::do_cal(src0, src, filter128s, sum); _mm_store_ps(src-MARGIN_LENGTH, sum); - FilterAllMargin::cal_left_margin(src, filter128s); + 
FilterAllLeftMargin::do_cal<0>(src, filter128s); } + template<> + static __forceinline void do_cal<1>(float * src, M128s& filter128s) + { + //filter 4 + __m128 sum = _mm_setzero_ps(); + //Only one of the last 4 filter coefficients is non-zero + _mm_store_ps(src-MARGIN_LENGTH, sum); + FilterAllLeftMargin::do_cal<0>(src, filter128s); + } +}; - static __forceinline void cal_right_margin(float * src, M128s& filter128s) +template +struct FilterAllRightMargin +{ + static __forceinline void cal(float * src, M128s& filter128s) + { + do_cal(src, filter128s); + } + template + static __forceinline void do_cal(float * src, M128s& filter128s) { //filter 4 { @@ -343,35 +438,59 @@ struct FilterAllMargin __m128 sum = _mm_setzero_ps(); Filter4::do_cal(src0, src+4, filter128s, sum); __m128 src4 = _mm_setzero_ps(); - XY_FILTER_4(src0, src4, filter128s.GetAt(), sum); + { XY_FILTER_4(src0, src4, filter128s.GetAt(), sum); } //store result _mm_store_ps(src, sum); } - FilterAllMargin::cal_right_margin(src+4, filter128s); + FilterAllRightMargin::do_cal<0>(src+4, filter128s); + } + template<> + static __forceinline void do_cal<1>(float * src, M128s& filter128s) + { + //filter 4 + { + __m128 src0 = _mm_load_ps(src); + __m128 sum = _mm_setzero_ps(); + Filter4::do_cal(src0, src+4, filter128s, sum); + //Only one of the last 4 filter coefficients is non-zero + { XY_FILTER_4_1(src0, filter128s.GetAt(), sum); } + //store result + _mm_store_ps(src, sum); + } + FilterAllRightMargin::do_cal<0>(src+4, filter128s); } }; template -struct FilterAllMargin +struct FilterAllLeftMargin { - static __forceinline void cal_left_margin(float * src, M128s& filter128s) + template + static __forceinline void do_cal(float * src, M128s& filter128s) { } - static __forceinline void cal_right_margin(float * src, M128s& filter128s) +}; + +template +struct FilterAllRightMargin +{ + template + static __forceinline void do_cal(float * src, M128s& filter128s) { } }; /**** + * Equivalent: + * xy_filter_c(float *dst, int 
width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 ); * See @xy_filter_c * Constrain: - * FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width + * FILTER_LENGTH<=width && width%4==0 **/ template void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter) { ASSERT( stride>=4*(width+FILTER_LENGTH) ); - ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); + ASSERT( ((stride|(4*width)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); M128s filter128s; filter128s.Load(filter); @@ -384,9 +503,9 @@ void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *dst2 = reinterpret_cast(dst_byte); //left margin - FilterAllMargin::cal_left_margin(dst2, filter128s); + FilterAllLeftMargin::cal(dst2, filter128s); float *dst_end1 = dst2 + width; - float *dst_end0 = dst_end1 - FILTER_LENGTH; + float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3); for (;dst2::cal_right_margin(dst2, filter128s); + FilterAllRightMargin::cal(dst2, filter128s); } } @@ -409,13 +528,23 @@ void xy_filter_sse_template(float *dst, int width, int height, int stride, const void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width) { typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter); - const Filter filters[] = { xy_filter_sse_template<4>, xy_filter_sse_template<8>, xy_filter_sse_template<12>, xy_filter_sse_template<16>, - xy_filter_sse_template<20>, xy_filter_sse_template<24>, xy_filter_sse_template<28> + const Filter filters[] = { + NULL, + xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, + xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, + xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, + xy_filter_sse_template<13>, 
xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, + xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, + xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, + xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28> }; if (filter_width<=28 && filter_width<=width) { - ASSERT(filter_width%4==0); - filters[(filter_width-1)/4](dst, width, height, stride, filter); + int tmp = filter_width; + // Skip tail zero, but we cannot (and don't have to) support more than 3 tail zeros currently. + while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 ) + tmp--; + filters[tmp](dst, width, height, stride, filter); } else { diff --git a/test/unit_test/unit_test.vcxproj b/test/unit_test/unit_test.vcxproj index f1d0da1..b2b0f6c 100644 --- a/test/unit_test/unit_test.vcxproj +++ b/test/unit_test/unit_test.vcxproj @@ -46,7 +46,7 @@ $(ProjectDir)bin\$(Configuration)\ - $(SolutionDir)\..\..\..\..\test\gtest-1.6.0\include;$(IncludePath) + $(SolutionDir)\..\..\..\..\test\gtest-1.6.0\include;$(SolutionDir)\..\..\..\..\src\subtitles;$(IncludePath) $(ProjectDir)bin\$(Configuration)\ @@ -58,7 +58,7 @@ true $(SolutionDir)..\..\..\..\lib\ - gtestDU.lib;%(AdditionalDependencies) + gtestDU.lib;subtitlesDU.lib;%(AdditionalDependencies) Console diff --git a/test/unit_test/xy_filter_benchmark.h b/test/unit_test/xy_filter_benchmark.h index 24f33ba..618657d 100644 --- a/test/unit_test/xy_filter_benchmark.h +++ b/test/unit_test/xy_filter_benchmark.h @@ -5,6 +5,7 @@ #include #include #include +#include "xy_malloc.h" typedef const UINT8 CUINT8, *PCUINT8; @@ -863,7 +864,7 @@ struct FilterAllMargin * FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width **/ template -void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter) +void xy_filter_sse_template_v0(float *dst, 
int width, int height, int stride, const float *filter) { ASSERT( stride>=4*(width+FILTER_LENGTH) ); ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); @@ -901,8 +902,8 @@ void xy_filter_sse_template(float *dst, int width, int height, int stride, const void xy_filter_sse_v5(float *dst, int width, int height, int stride, const float *filter, int filter_width) { typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter); - const Filter filters[] = { xy_filter_sse_template<4>, xy_filter_sse_template<8>, xy_filter_sse_template<12>, xy_filter_sse_template<16>, - xy_filter_sse_template<20>, xy_filter_sse_template<24>, xy_filter_sse_template<28> + const Filter filters[] = { xy_filter_sse_template_v0<4>, xy_filter_sse_template_v0<8>, xy_filter_sse_template_v0<12>, xy_filter_sse_template_v0<16>, + xy_filter_sse_template_v0<20>, xy_filter_sse_template_v0<24>, xy_filter_sse_template_v0<28> }; if (filter_width<=28 && filter_width<=width) { @@ -1068,71 +1069,349 @@ void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float } } +// v7 + +/**** + * @src4, @f4_1, @sum : __m128 + * @f4_1: const + * @sum : output + * @src4: undefined + **/ +#define XY_FILTER_4_1(src4, f4_1, sum) \ + src4 = _mm_mul_ps(src4, f4_1);\ + sum = _mm_add_ps(sum, src4); + +/**** + * Constrain: + * LENGTH%4 == 0 || LENGTH%4 == 1 + **/ +template +struct M128s_V1 +{ + __m128 x; + M128s_V1 next; + + template __forceinline __m128& GetAt() + { + return next.GetAt(); + } + template<> __forceinline __m128& GetAt<0>() + { + return x; + } + + template __forceinline __m128& GetAt() + { + return GetAt(); + } + + __forceinline void Load(const float* src) + { + x = _mm_load_ps(src); + next.Load(src+4); + } +}; + +template<> +struct M128s_V1<1> +{ + __m128 x; + + template __forceinline __m128& GetAt() + { + return x; + } + __forceinline void Load(const float* src) + { + x = _mm_set1_ps(*src); + } +}; + +template<> 
+struct M128s_V1<0> +{ + void Load(const float* src) + { + } +}; + +template +struct Filter4_V1 +{ + static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1& filter128s, __m128& sum) + { + __m128 src4_128 = _mm_load_ps(src4); + { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt(), sum); } + Filter4_V1::do_cal(src4_128, src4+4, filter128s, sum); + src0_128 = src4_128; + } +}; + +template +struct Filter4_V1 +{ + static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1& filter128s, __m128& sum) + { + cal_tail(src0_128, src4, filter128s, sum); + } + template + static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s_V1& filter128s, __m128& sum) + { + + } + template<> + static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s_V1& filter128s, __m128& sum) + { + { XY_FILTER_4_1(src0_128, filter128s.GetAt(), sum); } + } +}; + +template +struct Filter4_V1 +{ + static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1& filter128s, __m128& sum) + { + } +}; + +template +struct FilterAllLeftMargin_V1 +{ + static __forceinline void cal(float * src, M128s_V1& filter128s) + { + do_cal(src, filter128s); + } + template + static __forceinline void do_cal(float * src, M128s_V1& filter128s) + { + //filter 4 + __m128 src0 = _mm_setzero_ps(); + __m128 sum = _mm_setzero_ps(); + Filter4_V1::do_cal(src0, src, filter128s, sum); + _mm_store_ps(src-MARGIN_LENGTH, sum); + FilterAllLeftMargin_V1::do_cal<0>(src, filter128s); + } + template<> + static __forceinline void do_cal<1>(float * src, M128s_V1& filter128s) + { + //filter 4 + __m128 sum = _mm_setzero_ps(); + //Only one of the last 4 filter coefficients is non-zero + _mm_store_ps(src-MARGIN_LENGTH, sum); + FilterAllLeftMargin_V1::do_cal<0>(src, filter128s); + } +}; + +template +struct FilterAllRightMargin_V1 +{ + static __forceinline void cal(float * src, M128s_V1& filter128s) + { + do_cal(src, filter128s); + } + template + 
static __forceinline void do_cal(float * src, M128s_V1& filter128s) + { + //filter 4 + { + __m128 src0 = _mm_load_ps(src); + __m128 sum = _mm_setzero_ps(); + Filter4_V1::do_cal(src0, src+4, filter128s, sum); + __m128 src4 = _mm_setzero_ps(); + { XY_FILTER_4(src0, src4, filter128s.GetAt(), sum); } + //store result + _mm_store_ps(src, sum); + } + FilterAllRightMargin_V1::do_cal<0>(src+4, filter128s); + } + template<> + static __forceinline void do_cal<1>(float * src, M128s_V1& filter128s) + { + //filter 4 + { + __m128 src0 = _mm_load_ps(src); + __m128 sum = _mm_setzero_ps(); + Filter4_V1::do_cal(src0, src+4, filter128s, sum); + //Only one of the last 4 filter coefficients is non-zero + { XY_FILTER_4_1(src0, filter128s.GetAt(), sum); } + //store result + _mm_store_ps(src, sum); + } + FilterAllRightMargin_V1::do_cal<0>(src+4, filter128s); + } +}; + +template +struct FilterAllLeftMargin_V1 +{ + template + static __forceinline void do_cal(float * src, M128s_V1& filter128s) + { + } +}; + +template +struct FilterAllRightMargin_V1 +{ + template + static __forceinline void do_cal(float * src, M128s_V1& filter128s) + { + } +}; + +/**** + * Equivalent: + * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 ); + * See @xy_filter_c + * Constrain: + * FILTER_LENGTH<=width && width%4==0 + **/ +template +void xy_filter_sse_template_v1(float *dst, int width, int height, int stride, const float *filter) +{ + ASSERT( stride>=4*(width+FILTER_LENGTH) ); + ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); + + M128s_V1 filter128s; + filter128s.Load(filter); + + const float *filter_start = filter; + BYTE* dst_byte = reinterpret_cast(dst); + BYTE* end = dst_byte + height*stride; + for( ; dst_byte(dst_byte); + + //left margin + FilterAllLeftMargin_V1::cal(dst2, filter128s); + float *dst_end1 = dst2 + width; + float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3); + for (;dst2::do_cal(src0, 
src, filter128s, sum); + //store result + _mm_store_ps(dst2, sum); + } + FilterAllRightMargin_V1::cal(dst2, filter128s); + } +} + +/**** + * See @xy_filter_c + **/ +void xy_filter_sse_v7(float *dst, int width, int height, int stride, const float *filter, int filter_width) +{ + typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter); + const Filter filters[] = { + NULL, + xy_filter_sse_template_v1<1>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, + xy_filter_sse_template_v1<5>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, + xy_filter_sse_template_v1<9>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, + xy_filter_sse_template_v1<13>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, + xy_filter_sse_template_v1<17>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, + xy_filter_sse_template_v1<21>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, + xy_filter_sse_template_v1<25>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28> + }; + if (filter_width<=28 && filter_width<=width) + { + int tmp = filter_width; + while(filter[tmp-1]==0.0f && filter_width-tmp<3)//Skip tail zeros, but limited to 3. We cannot support more currently. 
+ tmp--; + filters[tmp](dst, width, height, stride, filter); + } + else + { + xy_filter_sse_v6(dst, width, height, stride, filter, filter_width); + } +} + +// v7 end + class XyFilterTest : public ::testing::Test { public: - static const int MAX_FILTER_LENGTH = 128; - static const int MAX_DATA_BUFF_SIZE = 512*1024; + static const int MAX_FILTER_LENGTH = 256; + static const int MAX_DATA_BUFF_SIZE = 512*512; + static const int MAX_BUFF_SIZE = MAX_DATA_BUFF_SIZE + 2*MAX_FILTER_LENGTH; + + float *buff_base; - float filter_f_base[MAX_FILTER_LENGTH+3]; float *filter_f; - float data_base[MAX_DATA_BUFF_SIZE+3]; float *data; int w, h, pitch; int filter_width; + int ex_filter_width; -protected: - virtual void SetUp() + XyFilterTest():buff_base(NULL),filter_f(NULL),data(NULL) { + buff_base = (float*)xy_malloc(MAX_BUFF_SIZE*sizeof(float)); + data = buff_base + MAX_FILTER_LENGTH; + filter_f = data + MAX_DATA_BUFF_SIZE; + } + ~XyFilterTest() + { + xy_free(buff_base);buff_base=NULL; + } + + const XyFilterTest& copy (const XyFilterTest& rhs) + { + w = rhs.w; + h = rhs.h; + pitch = rhs.pitch; + filter_width = rhs.filter_width; + ex_filter_width = rhs.ex_filter_width; + + memcpy(buff_base, rhs.buff_base, MAX_BUFF_SIZE*sizeof(float)); + return *this; } void FillRandData(int w, int h, int filter_width) { this->w = w; - this->h = h; + this->h = h; this->filter_width = filter_width; - ASSERT(w%4==0); - ASSERT(this->filter_width <= MAX_FILTER_LENGTH); - //ASSERT(this->filter_width%4==0); - - this->pitch = ((w+filter_width+3)&~3)*sizeof(float); - ASSERT(this->pitch*this->h <= MAX_DATA_BUFF_SIZE); + this->ex_filter_width = ((filter_width+3)&~3); + this->pitch = ((w+ex_filter_width+3)&~3)*sizeof(float); - data = data_base + filter_width; - if ((int)(data)&15) - { - data = (float*)(((int)data + 15) & ~15); - } - filter_f = filter_f_base; - if ((int)(filter_f)&15) - { - filter_f = (float*)(((int)filter_f + 15) & ~15); - } + ASSERT(this->pitch*this->h <= MAX_DATA_BUFF_SIZE); for (int i=0;i