From 34d16392f3238a1a8d9ee035ef4c829d3ab2a1dc Mon Sep 17 00:00:00 2001 From: xy Date: Sat, 28 Jul 2012 11:00:02 +0800 Subject: [PATCH] Blur Optimization. [Part 5] --- src/subtitles/xy_filter.cpp | 57 ++++++++++--- test/unit_test/test_xy_filter.h | 25 +++++- test/unit_test/xy_filter_benchmark.h | 153 ++++++++++++++++++++++++++++++++++- 3 files changed, 220 insertions(+), 15 deletions(-) diff --git a/src/subtitles/xy_filter.cpp b/src/subtitles/xy_filter.cpp index 77f6531..9dee499 100644 --- a/src/subtitles/xy_filter.cpp +++ b/src/subtitles/xy_filter.cpp @@ -113,15 +113,15 @@ void xy_filter_c(float *dst, int width, int height, int stride, const float *fil sum = _mm_add_ps(sum, f4_1) -__forceinline void xy_filter_one_line_sse_v4(float *dst, int width, const float *filter, int filter_width) +__forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width) { int xx_fix = width > filter_width ? 0 : filter_width - width; const float *filter_start = filter; float *dst2 = dst - filter_width; float *dst_endr = dst + width; - float *dst_end0 = dst_endr - filter_width; + float *dst_end0 = dst_endr - filter_width - 4; float *dst_endl = dst - xx_fix; - ASSERT(xx_fix==0 || dst_end0==dst_endl); + ASSERT(xx_fix==0 || dst_end0==dst_endl-4); ASSERT(filter_start == filter); filter_start += filter_width; @@ -171,17 +171,49 @@ __forceinline void xy_filter_one_line_sse_v4(float *dst, int width, const float _mm_store_ps(dst2, sum); } ASSERT(filter_start == filter); - for (;dst2=4*(width+filter_width) ); ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); @@ -230,7 +263,7 @@ void xy_filter_sse_v4(float *dst, int width, int height, int stride, const float BYTE* end = dst_byte + height*stride; for( ; dst_byte(dst_byte), width, filter, filter_width); + xy_filter_one_line_sse_v6(reinterpret_cast(dst_byte), width, filter, filter_width); } } @@ -386,7 +419,7 @@ void xy_filter_sse(float *dst, int width, int height, int stride, const float *f } else { - xy_filter_sse_v4(dst, width, height, stride, filter, filter_width); + xy_filter_sse_v6(dst, width, height, stride, filter, filter_width); } } diff --git a/test/unit_test/test_xy_filter.h b/test/unit_test/test_xy_filter.h index b0bc1c6..835de7c 100644 --- a/test/unit_test/test_xy_filter.h +++ b/test/unit_test/test_xy_filter.h @@ -13,6 +13,7 @@ void xy_filter_one_line_sse(float *dst, int width, const float *filter, int filt { xy_filter_sse(dst, width, 1, ((width*4)+15)&~15, filter, filter_width); } +void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width); template std::ostream& GetBufString(std::ostream& os, T* buf, int len) @@ -188,7 +189,7 @@ TEST_F(XyFilterTest, c_vs_sse) data1.copy(data0); data2.copy(data0); xy_filter_one_line_c(data1.data, data1.w, data1.filter_f, data1.ex_filter_width); - xy_filter_one_line_sse(data2.data, data2.w, data2.filter_f, data2.ex_filter_width); + xy_filter_one_line_sse_v6(data2.data, data2.w, data2.filter_f, data2.ex_filter_width); ASSERT_EQ(true, data2.operator==(data1)) < filter_width ? 0 : filter_width - width; + const float *filter_start = filter; + float *dst2 = dst - filter_width; + float *dst_endr = dst + width; + float *dst_end0 = dst_endr - filter_width - 4; + float *dst_endl = dst - xx_fix; + ASSERT(xx_fix==0 || dst_end0==dst_endl-4); + + ASSERT(filter_start == filter); + filter_start += filter_width; + const float *filter_end = filter_start; + + for (;dst2=4*(width+filter_width) ); + ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); + + BYTE* dst_byte = reinterpret_cast(dst); + BYTE* end = dst_byte + height*stride; + for( ; dst_byte(dst_byte), width, filter, filter_width); + } +} class XyFilterTest : public ::testing::Test { @@ -998,9 +1148,8 @@ TEST_F(XyFilterTest, function ## _ ## width ## _ ## height ## _ ## FILTER_LENGTH } #define DUAL_TEST(width, height, FILTER_LENGTH, loop_num) \ - FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v2) \ FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v4) \ - FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v5) + FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v6) DUAL_TEST(128, 16, 28, 20000) DUAL_TEST(256, 16, 28, 20000) -- 2.11.4.GIT