Let's get to float: new Gaussian blur implementation.
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob 67d634414137a1f64d09523faf75a9c38c37d847
#include "stdafx.h"

typedef const UINT8 CUINT8, *PCUINT8;

/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth;
 *
 * |-                       stride                           -|
 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
 * |-        margin         -|-   src width*height items    -|
 * |-  mwidth*height items  -|-                              -|
 * |-  do NOT need to init  -|-                              -|
 * |-      when input       -|-                              -|
 * |---------------------------------------------------------|
 **/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
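
For orientation, this is how a caller might drive xy_filter_c: the row buffer needs filter_width extra floats of margin on the left, exactly as the layout diagram above describes, and the kernel should already be normalized. A minimal sketch (the demo function and its values are mine, not part of the file):

// Usage sketch for xy_filter_c (illustrative only, not from the file).
void xy_filter_c_demo()
{
    const int width = 8, height = 1, filter_width = 4;
    const int stride = 4*(width + filter_width);      // row stride in bytes: margin + source row
    float buff[width + filter_width];                 // [margin | source row]
    float *dst = buff + filter_width;                 // source content starts at @dst
    for (int i = 0; i < width; i++)
        dst[i] = static_cast<float>(i);
    const float filter[filter_width] = {0.25f, 0.25f, 0.25f, 0.25f}; // normalized 4-tap box
    xy_filter_c(dst, width, height, stride, filter, filter_width);
    // width+filter_width output items now start at dst-filter_width (== buff)
}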
/****
 * See @xy_filter_c
 **/
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
# undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
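
The XY_FILTER_4 macro is the heart of the SSE path and is easier to audit against a scalar model: one invocation consumes four filter taps and eight consecutive source samples, and accumulates four adjacent output positions at once. A scalar rendition of what the shuffles compute (my reading, for reference only):

// Scalar equivalent of one XY_FILTER_4 invocation (illustrative).
// src8 = the samples in src4 followed by src_5_8; f = 4 filter taps;
// sum[i] accumulates the dot product for output position i.
inline void xy_filter_4_scalar(const float src8[8], const float f[4], float sum[4])
{
    for (int i = 0; i < 4; i++)       // 4 adjacent output positions
        for (int k = 0; k < 4; k++)   // 4 filter taps
            sum[i] += src8[i+k] * f[k];
}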
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST >= @width
 * if @dst_width > @width, the extra elements will be filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //convert 16 bytes to 16 floats: bytes -> words -> dwords -> floats
            //(the second-level epi8 unpack is safe because the high bytes are already zero)
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_stream_ps(dst2, dst_f1);
            _mm_stream_ps(dst2+4, dst_f2);
            _mm_stream_ps(dst2+8, dst_f3);
            _mm_stream_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
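
Since the SSE conversion only widens integers, it should agree with the C reference bit-for-bit, which makes a self-test cheap. A sketch, assuming xy_malloc/xy_free (used by xy_gaussian_blur below) return 16-byte-aligned blocks:

// Test sketch (illustrative): the SSE path should match the C reference exactly.
bool xy_byte_2_float_selftest()
{
    const int width = 37, height = 3, stride = 48;    // width deliberately not a multiple of 16
    const int dst_width = 40, dst_stride = dst_width*sizeof(float);
    UINT8 src[height*stride];
    for (int i = 0; i < height*stride; i++)
        src[i] = static_cast<UINT8>(rand() & 0xff);
    float *out_c   = reinterpret_cast<float*>(xy_malloc(height*dst_stride));
    float *out_sse = reinterpret_cast<float*>(xy_malloc(height*dst_stride));
    xy_byte_2_float_c  (out_c,   dst_width, dst_stride, src, width, height, stride);
    xy_byte_2_float_sse(out_sse, dst_width, dst_stride, src, width, height, stride);
    bool ok = memcmp(out_c, out_sse, height*dst_stride)==0;
    xy_free(out_c);
    xy_free(out_sse);
    return ok;
}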
/****
 * Copy transposed Matrix src to dst.
 * @dst_width MUST >= @height.
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_transpose(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * Transpose and round Matrix src, then copy to dst.
 * @dst_width MUST >= @height.
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_transpose(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
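
A worked example makes the transpose contract concrete: each source column becomes a dst row of dst_width items, zero-padded past height. (Demo values are mine; this uses the float overload.)

// Worked example of the transpose contract (illustrative only).
void xy_transpose_demo()
{
    float src[2][3] = { {1,2,3}, {4,5,6} };           // width=3, height=2
    float dst[3][4];                                   // width rows of dst_width items
    xy_transpose(&dst[0][0], 4, 4*sizeof(float),
                 &src[0][0], 3, 2, 3*sizeof(float));
    // dst == { {1,4,0,0}, {2,5,0,0}, {3,6,0,0} }
}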
/****
 * To Do: decent CPU capability check
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt, int r, int gt_ex_width)
{
    ASSERT(width<=stride && width+2*r<=dst_stride);
    int ex_mask_width = ((r*2+1)+3)&~3;
    ASSERT(ex_mask_width<=gt_ex_width);
    if (ex_mask_width>gt_ex_width)
    {
        //deliberate crash when the gaussian table is too small
        //(the ASSERT above only fires in debug builds)
        int o=0;
        o=o/o;
        exit(-1);
    }
    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width;

    int true_width = width+r*2;
    xy_transpose(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width);

    // transpose
    int true_height = height + 2*r;
    xy_transpose(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
    _mm_empty();
}
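
Finally, a sketch of a complete call. Only the signatures come from this file; the Gaussian table construction (normalized taps, zero-padded out to a multiple of four) is my assumption about what the caller must provide, and xy_malloc is assumed to return the 16-byte-aligned memory the SSE filter asserts on:

// Usage sketch for xy_gaussian_blur (illustrative; not code from this file).
void xy_gaussian_blur_demo(PCUINT8 src, int width, int height, int stride)
{
    const int r = 4;                                  // blur radius
    const double sigma = r/2.0;
    const int gt_ex_width = ((2*r+1)+3)&~3;           // 2r+1 taps, padded to a multiple of 4
    float *gt = reinterpret_cast<float*>(xy_malloc(gt_ex_width*sizeof(float)));
    float sum = 0;
    for (int i = 0; i <= 2*r; i++)
    {
        gt[i] = static_cast<float>(exp(-(i-r)*(i-r)/(2*sigma*sigma)));
        sum += gt[i];
    }
    for (int i = 0; i <= 2*r; i++)
        gt[i] /= sum;                                 // normalize to unit gain
    for (int i = 2*r+1; i < gt_ex_width; i++)
        gt[i] = 0;                                    // zero the padding taps

    int dst_stride = width + 2*r;                     // output grows by r on each side
    PUINT8 dst = reinterpret_cast<PUINT8>(xy_malloc((height+2*r)*dst_stride));
    xy_gaussian_blur(dst, dst_stride, src, width, height, stride, gt, r, gt_ex_width);
    // dst now holds the (width+2*r) x (height+2*r) blurred image
    xy_free(dst);
    xy_free(gt);
}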