From 538b1af2dc9249724d1ae4b8fa3ab972b43dffc8 Mon Sep 17 00:00:00 2001 From: xy Date: Wed, 18 Jul 2012 23:06:25 +0800 Subject: [PATCH] Let's get to float: new Gaussian blur implementation. --- src/subtitles/Rasterizer.cpp | 399 ++++++++++++++++++++---- src/subtitles/Rasterizer.h | 4 + src/subtitles/subtitles_vs2010.vcxproj | 3 + src/subtitles/subtitles_vs2010.vcxproj.filters | 3 + src/subtitles/xy_filter.cpp | 413 +++++++++++++++++++++++++ 5 files changed, 767 insertions(+), 55 deletions(-) create mode 100644 src/subtitles/xy_filter.cpp diff --git a/src/subtitles/Rasterizer.cpp b/src/subtitles/Rasterizer.cpp index 1457f47..86cd285 100644 --- a/src/subtitles/Rasterizer.cpp +++ b/src/subtitles/Rasterizer.cpp @@ -41,6 +41,8 @@ #define _IMPL_MIN _MIN #endif +typedef const UINT8 CUINT8, *PCUINT8; + //NOTE: signed or unsigned affects the result seriously #define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|y)<<8)|u)<<8)|v) @@ -50,7 +52,110 @@ *(y)=((color)>>16)&0xff;\ *(a)=((color)>>24)&0xff;\ } while(0) - + +class GaussianCoefficients +{ +public: + int g_r; + int g_w; + int g_w_ex; + float *g_f; + + double sigma; +public: + GaussianCoefficients(const double sigma) + { + g_r = 0; + g_w = 0; + g_w_ex = 0; + + g_f = NULL; + + this->sigma = 0; + init(sigma); + } + GaussianCoefficients(const GaussianCoefficients& priv) + :g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma),g_f(NULL) + ,g_w_ex(priv.g_w_ex) + { + if (this->g_w_ex > 0 && this != &priv) { + this->g_f = reinterpret_cast(xy_malloc(this->g_w_ex * sizeof(float))); + ASSERT(this->g_f); + memcpy(g_f, priv.g_f, this->g_w_ex * sizeof(g_f[0])); + } + } + + ~GaussianCoefficients() + { + xy_free(g_f); g_f=NULL; + } + +private: + int init(double sigma) + { + double a = -1 / (sigma * sigma * 2); + double exp_a = exp(a); + + double volume = 0; + + if (this->sigma == sigma) + return 0; + else + this->sigma = sigma; + + this->g_w = (int)ceil(sigma*3) | 1; + this->g_r = this->g_w / 2; + this->g_w_ex = (this->g_w + 3) & ~3; + + if (this->g_w_ex > 0) { + xy_free(this->g_f); + this->g_f = reinterpret_cast(xy_malloc(this->g_w_ex * sizeof(float))); + if (this->g_f == NULL) { + return -1; + } + } + + if (this->g_w > 0) { + volume = 0; + + double exp_0 = 1.0; + double exp_1 = exp_a; + double exp_2 = exp_1 * exp_1; + volume = exp_0; + this->g_f[this->g_r] = exp_0; + float* p_left = this->g_f+this->g_r-1; + float* p_right= this->g_f+this->g_r+1; + for(int i=0; ig_r;++i,p_left--,p_right++) + { + exp_0 *= exp_1; + exp_1 *= exp_2; + + *p_left = exp_0; + *p_right = exp_0; + + volume += exp_0; + volume += exp_0; + } + //equivalent: + // for (i = 0; i < this->g_w; ++i) { + // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 ); + // volume += this->g[i]; + // } + ASSERT(volume>0); + for (int i=0;ig_w;i++) + { + this->g_f[i] /= volume; + } + for (int i=this->g_w;ig_w_ex;i++) + { + this->g_f[i] = 0; + } + } + return 0; + } + +}; + class ass_synth_priv { public: @@ -71,9 +176,12 @@ public: double sigma; }; -struct ass_synth_priv_key + +// GaussianFilter = GaussianCoefficients or ass_synth_priv +template +struct GaussianFilterKey { - const double& operator()(const ass_synth_priv& x)const + const double& operator()(const GaussianFilter& x)const { return x.sigma; } @@ -128,7 +236,7 @@ ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(pri ass_synth_priv::~ass_synth_priv() { free(g); g=NULL; - free(gt2); gt2=NULL; + free(gt2); gt2=NULL; } int ass_synth_priv::generate_tables(double sigma) @@ -154,13 +262,13 @@ int ass_synth_priv::generate_tables(double sigma) if (this->g_w > 0) { this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned)); this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned)); - if (this->g == NULL || this->gt2 == NULL) { + if (this->g == NULL || this->gt2 == NULL) { return -1; - } + } } if (this->g_w > 0) { - volume_start = 0; + volume_start = 0; double exp_0 = 1.0; double exp_1 = exp_a; @@ -223,7 +331,7 @@ int ass_synth_priv::generate_tables(double sigma) break; else if(volume < TARGET_VOLUME) { - volume_start = volume_factor; + volume_start = volume_factor; } else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR) { @@ -263,10 +371,10 @@ int ass_synth_priv::generate_tables(double sigma) int last_mul = 0; unsigned *p_gt2 = this->gt2 + mx; *p_gt2 = 0; - for (int i = 1; i < 256; i++) { + for (int i = 1; i < 256; i++) { last_mul = last_mul+this->g[mx]; p_gt2 += this->g_w; - *p_gt2 = last_mul; + *p_gt2 = last_mul; //equivalent: // this->gt2[this->g_w * i+ mx] = this->g[mx] * i; } @@ -315,7 +423,7 @@ static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2, int mx; const unsigned *m3 = m2 + src * mwidth; unsigned sum = 0; - for (mx = mwidth-1; mx >= r - x ; mx--) { + for (mx = mwidth-1; mx >= r - x ; mx--) { sum += m3[mx]; dstp[mx] += sum; } @@ -481,6 +589,10 @@ static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2, } } +void xy_gaussian_blur(PUINT8 dst, int dst_stride, + PCUINT8 src, int width, int height, int stride, + const float *gt, int r, int gt_ex_width); + /** * \brief blur with [[1,2,1]. [2,4,2], [1,2,1]] kernel. */ @@ -728,50 +840,240 @@ bool Rasterizer::IsItReallyBlur( int fBlur, double fGaussianBlur ) return false; } return true; -} - +} + +// @return: true if actually a blur operation has done, or else false and output is leave unset. +// To Do: rewrite it or delete it +bool Rasterizer::OldFixedPointBlur(const Overlay& input_overlay, int fBlur, double fGaussianBlur, + SharedPtrOverlay output_overlay) +{ + using namespace ::boost::flyweights; + + ASSERT(IsItReallyBlur(fBlur, fGaussianBlur)); + if(!output_overlay) + { + return false; + } + output_overlay->CleanUp(); + + output_overlay->mOffsetX = input_overlay.mOffsetX; + output_overlay->mOffsetY = input_overlay.mOffsetY; + output_overlay->mWidth = input_overlay.mWidth; + output_overlay->mHeight = input_overlay.mHeight; + output_overlay->mOverlayWidth = input_overlay.mOverlayWidth; + output_overlay->mOverlayHeight = input_overlay.mOverlayHeight; + output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty; + + int bluradjust = 0; + if ( IsItReallyBlur(fBlur, fGaussianBlur) ) + { + if (fGaussianBlur > 0) + bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1; + if (fBlur) + bluradjust += 8; + // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit + bluradjust = (bluradjust+7)&~7; + + output_overlay->mOffsetX -= bluradjust; + output_overlay->mOffsetY -= bluradjust; + output_overlay->mWidth += (bluradjust<<1); + output_overlay->mHeight += (bluradjust<<1); + output_overlay->mOverlayWidth += (bluradjust>>2); + output_overlay->mOverlayHeight += (bluradjust>>2); + } + else + { + return false; + } + + output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15; + + BYTE* body = reinterpret_cast(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight)); + if( body==NULL ) + { + return false; + } + output_overlay->mBody.reset(body, xy_free); + memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight); + BYTE* border = NULL; + if (!output_overlay->mfWideOutlineEmpty) + { + border = reinterpret_cast(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight)); + if (border==NULL) + { + return false; + } + output_overlay->mBorder.reset(border, xy_free); + memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight); + } + + //copy buffer + for(int i = 1; i >= 0; i--) + { + byte* plan_selected = i==0 ? body : border; + const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get(); + + plan_selected += (bluradjust>>3) + (bluradjust>>3)*output_overlay->mOverlayPitch; + if ( plan_selected!=NULL && plan_input!=NULL ) + { + for (int j=0;jmOverlayPitch; + plan_input += input_overlay.mOverlayPitch; + } + } + } + + ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) ); + //flyweight, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1)); + // Do some gaussian blur magic + if ( fGaussianBlur > GAUSSIAN_BLUR_THREHOLD ) + { + byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border; + flyweight>, no_locking> fw_priv_blur(fGaussianBlur); + const ass_synth_priv& priv_blur = fw_priv_blur.get(); + if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w) + { + ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch, + priv_blur.gt2, priv_blur.g_r, priv_blur.g_w); + } + } + + for (int pass = 0; pass < fBlur; pass++) + { + if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3) + { + int pitch = output_overlay->mOverlayPitch; + byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border; + be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch); + } + } + return true; +} + // @return: true if actually a blur operation has done, or else false and output is leave unset. bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur, SharedPtrOverlay output_overlay) { using namespace ::boost::flyweights; + if (!(g_cpuid.m_flags & CCpuID::sse2)) + { + // C code path of floating point version is extreamly slow, + // so we fall back to fixed point version instead + return Rasterizer::OldFixedPointBlur(input_overlay, fBlur, fGaussianBlur, output_overlay); + } + ASSERT(IsItReallyBlur(fBlur, fGaussianBlur)); - if(!output_overlay) + if(!output_overlay || !IsItReallyBlur(fBlur, fGaussianBlur)) { return false; + } + if (input_overlay.mOverlayWidth<=0 || input_overlay.mOverlayHeight<=0) + { + return true; } - output_overlay->CleanUp(); + if (fGaussianBlur>0) + { + if (fBlur)//this insane thing should NEVER happen + { + SharedPtrOverlay tmp(new Overlay()); + + bool rv = GaussianBlur(input_overlay, fGaussianBlur, tmp); + ASSERT(rv); + rv = BeBlur(*tmp, fBlur, output_overlay); + ASSERT(rv); + } + else + { + bool rv = GaussianBlur(input_overlay, fGaussianBlur, output_overlay); + ASSERT(rv); + } + } + else if (fBlur) + { + bool rv = BeBlur(input_overlay, fBlur, output_overlay); + ASSERT(rv); + } + return true; +} + +bool Rasterizer::GaussianBlur( const Overlay& input_overlay, double fGaussianBlur, SharedPtrOverlay output_overlay ) +{ + using namespace ::boost::flyweights; - output_overlay->mOffsetX = input_overlay.mOffsetX; - output_overlay->mOffsetY = input_overlay.mOffsetY; - output_overlay->mWidth = input_overlay.mWidth; - output_overlay->mHeight = input_overlay.mHeight; - output_overlay->mOverlayWidth = input_overlay.mOverlayWidth; - output_overlay->mOverlayHeight = input_overlay.mOverlayHeight; + ASSERT(output_overlay); + output_overlay->CleanUp(); output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty; - int bluradjust = 0; - if ( IsItReallyBlur(fBlur, fGaussianBlur) ) - { - if (fGaussianBlur > 0) - bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1; - if (fBlur) - bluradjust += 8; - // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit - bluradjust = (bluradjust+7)&~7; + ASSERT(fGaussianBlur > 0); + flyweight>, no_locking> fw_priv_blur(fGaussianBlur); + const GaussianCoefficients& priv_blur = fw_priv_blur.get(); - output_overlay->mOffsetX -= bluradjust; - output_overlay->mOffsetY -= bluradjust; - output_overlay->mWidth += (bluradjust<<1); - output_overlay->mHeight += (bluradjust<<1); - output_overlay->mOverlayWidth += (bluradjust>>2); - output_overlay->mOverlayHeight += (bluradjust>>2); - } + int bluradjust = priv_blur.g_r * 8; + output_overlay->mOffsetX = input_overlay.mOffsetX - bluradjust; + output_overlay->mOffsetY = input_overlay.mOffsetY - bluradjust; + output_overlay->mWidth = input_overlay.mWidth + (bluradjust<<1); + output_overlay->mHeight = input_overlay.mHeight + (bluradjust<<1); + output_overlay->mOverlayWidth = input_overlay.mOverlayWidth + (bluradjust>>2); + output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust>>2); + + output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15; + + BYTE* blur_plan = reinterpret_cast(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight)); + //memset(blur_plan, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight); + + const BYTE* plan_input = input_overlay.mfWideOutlineEmpty ? input_overlay.mBody.get() : input_overlay.mBorder.get(); + ASSERT(output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w); + xy_gaussian_blur(blur_plan, output_overlay->mOverlayPitch, + plan_input, input_overlay.mOverlayWidth, input_overlay.mOverlayHeight, input_overlay.mOverlayPitch, + priv_blur.g_f, priv_blur.g_r, priv_blur.g_w_ex); + if (input_overlay.mfWideOutlineEmpty) + { + output_overlay->mBody.reset(blur_plan, xy_free); + } else { - return false; + output_overlay->mBorder.reset(blur_plan, xy_free); + + BYTE* body = reinterpret_cast(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight)); + if( body==NULL ) + { + return false; + } + output_overlay->mBody.reset(body, xy_free); + memset(body, 0, output_overlay->mOverlayPitch * (bluradjust>>3)); + body += (bluradjust>>3)*output_overlay->mOverlayPitch; + plan_input = input_overlay.mBody.get(); + ASSERT(plan_input); + for (int j=0;j>3)); + memcpy(body+(bluradjust>>3), plan_input, input_overlay.mOverlayWidth); + memset(body+(bluradjust>>3)+input_overlay.mOverlayWidth, 0, (bluradjust>>3)); + body += output_overlay->mOverlayPitch; + plan_input += input_overlay.mOverlayPitch; + } + memset(body, 0, output_overlay->mOverlayPitch * (bluradjust>>3)); } + return true; +} + +bool Rasterizer::BeBlur( const Overlay& input_overlay, int fBlur, SharedPtrOverlay output_overlay ) +{ + ASSERT(output_overlay); + output_overlay->CleanUp(); + output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty; + + ASSERT(fBlur>0); + int bluradjust = 8; + output_overlay->mOffsetX = input_overlay.mOffsetX - bluradjust; + output_overlay->mOffsetY = input_overlay.mOffsetY - bluradjust; + output_overlay->mWidth = input_overlay.mWidth + (bluradjust<<1); + output_overlay->mHeight = input_overlay.mHeight + (bluradjust<<1); + output_overlay->mOverlayWidth = input_overlay.mOverlayWidth + (bluradjust>>2); + output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust>>2); output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15; @@ -805,27 +1107,14 @@ bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianB { for (int j=0;jmOverlayPitch; plan_input += input_overlay.mOverlayPitch; } } } - ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) ); - //flyweight, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1)); - // Do some gaussian blur magic - if ( fGaussianBlur > GAUSSIAN_BLUR_THREHOLD ) - { - byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border; - flyweight, no_locking> fw_priv_blur(fGaussianBlur); - const ass_synth_priv& priv_blur = fw_priv_blur.get(); - if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w) - { - ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch, - priv_blur.gt2, priv_blur.g_r, priv_blur.g_w); - } - } + ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) ); for (int pass = 0; pass < fBlur; pass++) { @@ -836,8 +1125,8 @@ bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianB be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch); } } - return true; -} + return true; +} /////////////////////////////////////////////////////////////////////////// diff --git a/src/subtitles/Rasterizer.h b/src/subtitles/Rasterizer.h index 0150806..0a1e3f8 100644 --- a/src/subtitles/Rasterizer.h +++ b/src/subtitles/Rasterizer.h @@ -221,7 +221,11 @@ public: static bool Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay); static bool IsItReallyBlur(int fBlur, double fGaussianBlur); + static bool OldFixedPointBlur(const Overlay& input_overlay, int fBlur, double fGaussianBlur, + SharedPtrOverlay output_overlay); static bool Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur, SharedPtrOverlay output_overlay); + static bool BeBlur(const Overlay& input_overlay, int fBlur, SharedPtrOverlay output_overlay); + static bool GaussianBlur(const Overlay& input_overlay, double fGaussianBlur, SharedPtrOverlay output_overlay); static SharedPtrByte CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect, const GrayImage2* alpha_mask, diff --git a/src/subtitles/subtitles_vs2010.vcxproj b/src/subtitles/subtitles_vs2010.vcxproj index 797fcd5..6b38c41 100644 --- a/src/subtitles/subtitles_vs2010.vcxproj +++ b/src/subtitles/subtitles_vs2010.vcxproj @@ -207,6 +207,9 @@ + + Use + NotUsing NotUsing diff --git a/src/subtitles/subtitles_vs2010.vcxproj.filters b/src/subtitles/subtitles_vs2010.vcxproj.filters index 726ea61..ea0c3b2 100644 --- a/src/subtitles/subtitles_vs2010.vcxproj.filters +++ b/src/subtitles/subtitles_vs2010.vcxproj.filters @@ -96,6 +96,9 @@ Source Files + + Source Files + diff --git a/src/subtitles/xy_filter.cpp b/src/subtitles/xy_filter.cpp new file mode 100644 index 0000000..67d6344 --- /dev/null +++ b/src/subtitles/xy_filter.cpp @@ -0,0 +1,413 @@ +#include "stdafx.h" + +typedef const UINT8 CUINT8, *PCUINT8; + +/**** + * dst memory layout: + * 1. Source content starts from @dst; + * 2. Output content starts from @dst-@mwitdth; + * + * |- stride -| + * | <- @dst-@mwidth --------| <- @dst+0 --------------------| + * |- -|- -| + * |- margin -|- src width*height items -| + * |- mwidth*heigth items -|- -| + * |- do NOT need to init -|- -| + * |- when input -|- -| + * |---------------------------------------------------------| + **/ +void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width) +{ + ASSERT( stride>=4*(width+filter_width) ); + + int xx_fix = width > filter_width ? 0 : filter_width - width; + const float *filter_start = filter; + + BYTE* end = reinterpret_cast(dst) + height*stride; + BYTE* dst_byte = reinterpret_cast(dst); + for( ; dst_byte(dst_byte); + float *dst2 = dst_f - filter_width; + float *dst_endr = dst_f + width; + float *dst_end0 = dst_endr - filter_width; + float *dst_endl = dst_f - xx_fix; + ASSERT(xx_fix==0 || dst_end0==dst_endl); + + ASSERT(filter_start == filter); + filter_start += filter_width; + const float *filter_end = filter_start; + for (;dst2=4*(width+filter_width) ); + ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast(dst)|reinterpret_cast(filter))&15)==0 ); + + int xx_fix = width > filter_width ? 0 : filter_width - width; + const float *filter_start = filter; + BYTE* dst_byte = reinterpret_cast(dst); + BYTE* end = dst_byte + height*stride; + for( ; dst_byte(dst_byte); + float *dst2 = dst_f - filter_width; + float *dst_endr = dst_f + width; + float *dst_end0 = dst_endr - filter_width; + float *dst_endl = dst_f - xx_fix; + ASSERT(xx_fix==0 || dst_end0==dst_endl); + + ASSERT(filter_start == filter); + filter_start += filter_width; + const float *filter_end = filter_start; + + for (;dst2= @width + * if @dst_width>@width, the extra elements will be filled with 0. + **/ +void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride, + PCUINT8 src, int width, int height, int stride) +{ + PUINT8 dst_byte = reinterpret_cast(dst); + + PCUINT8 src_end = src + height*stride; + for( ; src(dst_byte); + + for (;src2(dst_byte)+dst_width; + for (;dst2=width ); + ASSERT( ((reinterpret_cast(dst)|dst_stride)&15)==0 ); + PUINT8 dst_byte = reinterpret_cast(dst); + + PCUINT8 src_end = src + height*stride; + for( ; src(dst_byte); + + for (;src2(src2)); + __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128()); + __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128()); + __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128()); + __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128()); + __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128()); + __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128()); + __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo); + __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi); + __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo); + __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi); + _mm_stream_ps(dst2, dst_f1); + _mm_stream_ps(dst2+4, dst_f2); + _mm_stream_ps(dst2+8, dst_f3); + _mm_stream_ps(dst2+12, dst_f4); + } + PCUINT8 src2_end = src + width; + for (;src2(dst_byte)+dst_width; + for (;dst2= @height. + * if @dst_width > @height, the extra elements will be filled with 0. + **/ +void xy_transpose(float *dst, int dst_width, int dst_stride, + const float *src, int width, int height, int src_stride) +{ + ASSERT(dst_width >= height); + PUINT8 dst_byte = reinterpret_cast(dst); + const float* src_end = src + width; + PCUINT8 src2_end = reinterpret_cast(src) + height*src_stride; + for( ; src(src); + + float *dst2 = reinterpret_cast(dst_byte); + for (;src2(src2); + } + float *dst2_end = reinterpret_cast(dst_byte) + dst_width; + for (;dst2= @height. + * if @dst_width > @height, the extra elements will be filled with 0. + **/ +void xy_transpose(UINT8 *dst, int dst_width, int dst_stride, + const float *src, int width, int height, int src_stride) +{ + ASSERT(dst_width >= height); + PUINT8 dst_byte = reinterpret_cast(dst); + const float* src_end = src + width; + PCUINT8 src2_end = reinterpret_cast(src) + height*src_stride; + for( ; src(src); + + UINT8 *dst2 = reinterpret_cast(dst_byte); + for (;src2(*reinterpret_cast(src2)+0.5); + } + UINT8 *dst2_end = reinterpret_cast(dst_byte) + dst_width; + for (;dst2gt_ex_width) + { + int o=0; + o=o/o; + exit(-1); + } + + int fwidth = (width+3)&~3; + int fstride = (fwidth + ex_mask_width)*sizeof(float); + int fheight = (height+3)&~3; + int fstride_ver = (fheight+ex_mask_width)*sizeof(float); + + PUINT8 buff_base = reinterpret_cast(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver)); + + float *hor_buff_base = reinterpret_cast(buff_base); + float *hor_buff = hor_buff_base + ex_mask_width; + + // byte to float + ASSERT( ((width+15)&~15)<=stride ); + xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride); + + // horizontal pass + xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width); + + + // transpose + float *ver_buff_base = reinterpret_cast(buff_base + height*fstride); + float *ver_buff = ver_buff_base + ex_mask_width; + + int true_width = width+r*2; + xy_transpose(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride); + + // vertical pass + xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width); + + // transpose + int true_height = height + 2*r; + xy_transpose(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver); + + xy_free(buff_base); + _mm_empty(); +} -- 2.11.4.GIT