3 typedef const UINT8 CUINT8
, *PCUINT8
;
7 * 1. Source content starts from @dst;
8 * 2. Output content starts from @dst-@mwidth;
11 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
13 * |- margin -|- src width*height items -|
14 * |- mwidth*height items -|- -|
15 * |- do NOT need to init -|- -|
16 * |- when input -|- -|
17 * |---------------------------------------------------------|
// Horizontal 1-D FIR filter, scalar (C) reference implementation.
// For each of @height rows (@stride bytes apart) it convolves the row's
// @width samples with @filter (@filter_width taps) and also produces
// @filter_width extra output samples in the margin at @dst-@filter_width
// (see the layout comment above).
// NOTE(review): the inner multiply-accumulate and store statements are not
// visible in this chunk; comments below describe only the visible control flow.
19 void xy_filter_c(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
// Each row must have room for width + filter_width floats (4 bytes each).
21 ASSERT( stride
>=4*(width
+filter_width
) );
// xx_fix > 0 only when the filter is wider than the row; it is the amount by
// which the left-margin region would otherwise overrun the data.
23 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
24 const float *filter_start
= filter
;
// Walk rows via BYTE pointers because @stride is in bytes.
26 BYTE
* end
= reinterpret_cast<BYTE
*>(dst
) + height
*stride
;
27 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
28 for( ; dst_byte
<end
; dst_byte
+=stride
)
// Per-row cursors: dst2 starts in the left margin (filter_width before the
// data); dst_endr is one past the last data sample; dst_end0 is where the
// right-margin region begins; dst_endl bounds the left-margin loop.
30 float *dst_f
= reinterpret_cast<float*>(dst_byte
)
31 float *dst2
= dst_f
- filter_width
;
32 float *dst_endr
= dst_f
+ width
;
33 float *dst_end0
= dst_endr
- filter_width
;
34 float *dst_endl
= dst_f
- xx_fix
;
35 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
// Each row starts with the full filter window [filter, filter+filter_width).
37 ASSERT(filter_start
== filter
);
38 filter_start
+= filter_width
;
39 const float *filter_end
= filter_start
;
// Left margin: outputs that only see the beginning of the row; the active
// filter window shrinks from the left as filter_start is walked back.
40 for (;dst2
<dst_endl
;dst2
++, filter_start
--)//left margin
42 const float *src
= dst_f
;
44 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
// Degenerate case (width < filter_width): both ends of the window shrink.
50 for (;dst2
<dst_f
;dst2
++, filter_start
--, filter_end
--)//if width < filter_width
52 const float *src
= dst_f
;
54 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
// Center: full-width convolution over samples that are fully in range.
60 ASSERT(filter_start
==filter
);
61 for (;dst2
<dst_end0
;dst2
++)
63 const float *src
= dst2
;
66 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
// Right margin: the window shrinks from the right as filter_end is walked back.
72 for (;dst2
<dst_endr
;dst2
++, filter_end
--)//right margin
74 const float *src
= dst2
;
76 for(const float* f
=filter
;f
<filter_end
;f
++, src
++)
// SSE implementation of xy_filter_c: same row-by-row 1-D FIR convolution,
// but producing 4 outputs per iteration with 128-bit vectors and using
// non-temporal stores (_mm_stream_ps) for the results.
// NOTE(review): several brace lines and variable declarations (e.g. src_5_8
// and f4 in the later loops) are not visible in this chunk.
88 void xy_filter_sse(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
93 ASSERT( stride
>=4*(width
+filter_width
) );
// Everything must be 16-byte aligned for _mm_load_ps/_mm_stream_ps, and
// width/filter_width must be multiples of 4.
// NOTE(review): reinterpret_cast<int> truncates pointers on 64-bit targets —
// presumably this code targets 32-bit builds; verify before porting.
94 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
// Same region bookkeeping as the scalar version (see xy_filter_c).
96 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
97 const float *filter_start
= filter
;
98 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
99 BYTE
* end
= dst_byte
+ height
*stride
;
100 for( ; dst_byte
<end
; dst_byte
+=stride
)
102 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
103 float *dst2
= dst_f
- filter_width
;
104 float *dst_endr
= dst_f
+ width
;
105 float *dst_end0
= dst_endr
- filter_width
;
106 float *dst_endl
= dst_f
- xx_fix
;
107 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
109 ASSERT(filter_start
== filter
);
110 filter_start
+= filter_width
;
111 const float *filter_end
= filter_start
;
// Left margin, 4 outputs per pass; samples left of the row are treated as 0
// (src4 starts zeroed).
113 for (;dst2
<dst_endl
;dst2
+=4)//left margin
115 const float *src
= dst_f
;
119 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
120 __m128 sum
= _mm_setzero_ps();
121 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
123 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
124 __m128 f4
= _mm_load_ps(f
);
// Core 4-tap x 4-output step: given samples 1..4 (src4) and 5..8 (src_5_8)
// and 4 filter taps (f4), accumulates the 4 shifted sample windows
// (1234, 3456, 2345, 4567) weighted by the broadcast taps into @sum.
// (No comments may be inserted inside the backslash-continued macro body.)
126 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
127 __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
128 f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
129 f4_1 = _mm_mul_ps(f4_1, src4);\
130 sum = _mm_add_ps(sum, f4_1);\
131 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
132 f4_1 = _mm_unpackhi_ps(f4,f4);\
133 f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
134 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
135 sum = _mm_add_ps(sum, f4_1);\
136 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
137 f4_1 = _mm_unpacklo_ps(f4,f4);\
138 f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
139 f4_1 = _mm_mul_ps(f4_1, src4);\
140 sum = _mm_add_ps(sum, f4_1);\
141 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
142 f4_1 = _mm_unpackhi_ps(f4,f4);\
143 f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
144 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
145 sum = _mm_add_ps(sum, f4_1)
// Braced so the macro's local f4_1/src_3_6 stay scoped to this statement.
147 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
// Non-temporal store: bypasses the cache; dst2 is 16-byte aligned per the
// entry assert.
151 _mm_stream_ps(dst2
, sum
);
// Degenerate case (width < filter_width): both window ends shrink.
153 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
155 const float *src
= dst_f
;
159 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
160 __m128 sum
= _mm_setzero_ps();
162 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
164 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
167 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
// Tail step: one more partial window with zeroed trailing samples and the
// taps at filter_end.
170 src_5_8
= _mm_setzero_ps();
171 f4
= _mm_load_ps(filter_end
);
172 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
174 _mm_stream_ps(dst2
, sum
);
// Center region: full-width windows, all loads in range.
176 ASSERT(filter_start
== filter
);
177 for (;dst2
<dst_end0
;dst2
+=4)
179 const float *src
= dst2
;
182 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
183 __m128 sum
= _mm_setzero_ps();
184 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
187 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
188 __m128 f4
= _mm_load_ps(f
);
190 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
194 _mm_stream_ps(dst2
, sum
);
// Right margin: windows shrink from the right; trailing samples are zeroed.
196 for (;dst2
<dst_endr
;dst2
+=4)//right margin
198 const float *src
= dst2
;
202 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
203 __m128 sum
= _mm_setzero_ps();
205 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
208 src_5_8
= _mm_load_ps(src
);//5 6 7 8
211 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
214 //move new 4 in_n_out to old 4 in_n_out
216 src_5_8
= _mm_setzero_ps();
217 f4
= _mm_load_ps(filter_end
);
218 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
220 _mm_stream_ps(dst2
, sum
);
227 * Copy and convert src to dst line by line.
228 * @dst_width MUST >= @width
229 * if @dst_width>@width, the extra elements will be filled with 0.
// Scalar version: copy @src (bytes) to @dst (floats) line by line, widening
// each byte to float; the dst_width-width trailing elements of each output
// row are zero-filled (see the comment block above).
// NOTE(review): the per-element conversion/fill statements and the inner
// src2 declaration are not visible in this chunk.
231 void xy_byte_2_float_c(float *dst
, int dst_width
, int dst_stride
,
232 PCUINT8 src
, int width
, int height
, int stride
)
// dst_stride is in bytes, so walk the output through a byte pointer.
234 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
236 PCUINT8 src_end
= src
+ height
*stride
;
237 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
// NOTE(review): this inner src_end intentionally shadows the outer one; it
// marks the end of the current row's @width payload bytes.
240 PCUINT8 src_end
= src2
+ width
;
241 float *dst2
= reinterpret_cast<float*>(dst_byte
);
// Convert the row's bytes to floats (conversion statement not visible here).
243 for (;src2
<src_end
;src2
++, dst2
++)
// Zero-fill the padding up to dst_width (fill statement not visible here).
247 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
248 for (;dst2
<dst2_end
;dst2
++)
256 * See @xy_byte_2_float_c
// SSE version of xy_byte_2_float_c: widens 16 source bytes per iteration to
// 16 floats via zero-extending unpacks, then handles the remainder and the
// zero-fill tail per row.
258 void xy_byte_2_float_sse(float *dst
, int dst_width
, int dst_stride
,
259 PCUINT8 src
, int width
, int height
, int stride
)
261 ASSERT( dst_width
>=width
);
// Output pointer and stride must be 16-byte aligned for _mm_stream_ps.
// NOTE(review): reinterpret_cast<int> truncates pointers on 64-bit targets —
// presumably a 32-bit-only build; verify before porting.
262 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
263 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
265 PCUINT8 src_end
= src
+ height
*stride
;
266 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
// Vector loop covers the largest multiple of 16 bytes in the row.
269 PCUINT8 src2_end0
= src2
+ (width
&~15);
270 float *dst2
= reinterpret_cast<float*>(dst_byte
);
272 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
// Unaligned load is required: @src rows are only guaranteed @stride apart.
275 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
// First unpack: 16 bytes -> 16-bit lanes [v,0] interleaved with zero.
276 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
277 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
// Second unpack widens to 32-bit lanes. Using epi8 again is correct here:
// each 16-bit lane is bytes [v,0], and interleaving with [0,0] yields
// [v,0,0,0] — i.e. the zero-extended 32-bit value.
278 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
279 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
280 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
281 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
// int32 -> float conversion, 4 lanes at a time.
282 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
283 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
284 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
285 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
// Non-temporal stores of the 16 resulting floats.
286 _mm_stream_ps(dst2
, dst_f1
);
287 _mm_stream_ps(dst2
+4, dst_f2
);
288 _mm_stream_ps(dst2
+8, dst_f3
);
289 _mm_stream_ps(dst2
+12, dst_f4
);
// Scalar remainder: the last width%16 bytes of the row (conversion
// statement not visible in this chunk).
291 PCUINT8 src2_end
= src
+ width
;
292 for (;src2
<src2_end
;src2
++,dst2
++)
// Zero-fill the padding up to dst_width (fill statement not visible here).
296 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
297 for (;dst2
<dst2_end
;dst2
++)
305 * Copy transposed Matrix src to dst.
306 * @dst_width MUST >= @height.
307 * if @dst_width > @height, the extra elements will be filled with 0.
// Copy the transposed @width x @height matrix @src into @dst: source column k
// becomes destination row k. Each output row has @dst_width elements; the
// dst_width-height tail is zero-filled (fill statement not visible in this
// chunk; see the comment block above).
309 void xy_transpose(float *dst
, int dst_width
, int dst_stride
,
310 const float *src
, int width
, int height
, int src_stride
)
312 ASSERT(dst_width
>= height
);
313 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
// Outer loop walks the @width source columns (src advances one float).
314 const float* src_end
= src
+ width
;
// Fixed end marker: height rows past the ORIGINAL src. Since each column
// start is < src_stride bytes past the row start, the inner loop below still
// performs exactly @height steps for every column.
315 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
316 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
318 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
320 float *dst2
= reinterpret_cast<float*>(dst_byte
);
// Walk down the column (one element per source row) writing it out as a
// contiguous destination row.
321 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
323 *dst2
= *reinterpret_cast<const float*>(src2
);
// Zero-fill the remaining dst_width-height elements (statement not visible).
325 float *dst2_end
= reinterpret_cast<float*>(dst_byte
) + dst_width
;
326 for (;dst2
<dst2_end
;dst2
++)
334 * Transpose and round Matrix src, then copy to dst.
335 * @dst_width MUST >= @height.
336 * if @dst_width > @height, the extra elements will be filled with 0.
// Overload of xy_transpose that also rounds: transposes the float matrix
// @src into the byte matrix @dst, converting each element with +0.5
// round-to-nearest. The dst_width-height tail of each output row is
// zero-filled (fill statement not visible in this chunk).
338 void xy_transpose(UINT8
*dst
, int dst_width
, int dst_stride
,
339 const float *src
, int width
, int height
, int src_stride
)
341 ASSERT(dst_width
>= height
);
342 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
// Same column-walk bookkeeping as the float overload above.
343 const float* src_end
= src
+ width
;
344 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
345 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
347 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
349 UINT8
*dst2
= reinterpret_cast<UINT8
*>(dst_byte
);
350 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
// Round-to-nearest via +0.5 then truncating cast.
// NOTE(review): assumes values are already clamped to [0,255] upstream —
// values outside that range would overflow the UINT8 cast; confirm.
352 *dst2
= static_cast<UINT8
>(*reinterpret_cast<const float*>(src2
)+0.5);
// Zero-fill the remaining dst_width-height bytes (statement not visible).
354 UINT8
*dst2_end
= reinterpret_cast<UINT8
*>(dst_byte
) + dst_width
;
355 for (;dst2
<dst2_end
;dst2
++)
363 * To Do: decent CPU capability check
// Separable Gaussian blur of the @width x @height byte image @src into @dst.
// @gt is the (padded) Gaussian kernel of radius @r stored in a buffer of
// @gt_ex_width floats. The output is (width+2r) x (height+2r) because the
// blur expands the image by the kernel radius on every side.
// Pipeline: bytes->floats, horizontal SSE filter, transpose, vertical pass
// as a second horizontal SSE filter, transpose back with rounding to bytes.
// NOTE(review): this chunk ends mid-function — the error path for the
// ex_mask_width check, several intermediate statements, and the final
// cleanup (presumably xy_free(buff_base)) are past the visible end; confirm
// against the full file.
365 void xy_gaussian_blur(PUINT8 dst
, int dst_stride
,
366 PCUINT8 src
, int width
, int height
, int stride
,
367 const float *gt
, int r
, int gt_ex_width
)
369 ASSERT(width
<=stride
&& width
+2*r
<=dst_stride
);
// Kernel width 2r+1 rounded up to a multiple of 4 (SSE filter requirement).
370 int ex_mask_width
= ((r
*2+1)+3)&~3;
371 ASSERT(ex_mask_width
<=gt_ex_width
)
// Runtime guard mirroring the assert (its body is not visible in this chunk).
372 if (ex_mask_width
>gt_ex_width
)
// Working-buffer geometry: widths/heights rounded up to multiples of 4;
// strides (in bytes) leave ex_mask_width extra floats per line for the
// filter margins.
379 int fwidth
= (width
+3)&~3;
380 int fstride
= (fwidth
+ ex_mask_width
)*sizeof(float);
381 int fheight
= (height
+3)&~3;
382 int fstride_ver
= (fheight
+ex_mask_width
)*sizeof(float);
// One allocation holds both the horizontal buffer (height*fstride bytes)
// and the transposed vertical buffer.
384 PUINT8 buff_base
= reinterpret_cast<PUINT8
>(xy_malloc(height
*fstride
+ (fwidth
+ ex_mask_width
)*fstride_ver
));
386 float *hor_buff_base
= reinterpret_cast<float*>(buff_base
);
// hor_buff is offset so the filter's left margin writes land inside the
// allocation.
387 float *hor_buff
= hor_buff_base
+ ex_mask_width
;
// NOTE(review): this assert presumably guarantees xy_byte_2_float_sse's
// 16-byte-block reads stay within each source row; confirm against caller.
390 ASSERT( ((width
+15)&~15)<=stride
);
// Pass 1: widen source bytes to floats (with zero padding up to fwidth).
391 xy_byte_2_float_sse(hor_buff
, fwidth
, fstride
, src
, width
, height
, stride
);
// Pass 2: horizontal convolution; results start at hor_buff-ex_mask_width.
394 xy_filter_sse(hor_buff
, fwidth
, height
, fstride
, gt
, ex_mask_width
);
398 float *ver_buff_base
= reinterpret_cast<float*>(buff_base
+ height
*fstride
);
399 float *ver_buff
= ver_buff_base
+ ex_mask_width
;
// Pass 3: transpose the horizontally-blurred (width+2r)-wide result so the
// vertical pass can reuse the horizontal SSE filter.
401 int true_width
= width
+r
*2;
402 xy_transpose(ver_buff
, fheight
, fstride_ver
, hor_buff
-r
*2, true_width
, height
, fstride
);
// Pass 4: "vertical" convolution (horizontal in transposed space).
405 xy_filter_sse(ver_buff
, fheight
, true_width
, fstride_ver
, gt
, ex_mask_width
);
// Pass 5: transpose back, rounding floats to bytes, into the final
// (width+2r) x (height+2r) destination.
408 int true_height
= height
+ 2*r
;
409 xy_transpose(dst
, true_width
, dst_stride
, ver_buff
-r
*2, true_height
, true_width
, fstride_ver
);