3 typedef const UINT8 CUINT8
, *PCUINT8
;
7 * 1. Source content starts from @dst;
8 * 2. Output content starts from @dst-@mwidth;
11 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
13 * |- margin -|- src width*height items -|
14 * |- mwidth*height items -|- -|
15 * |- do NOT need to init -|- -|
16 * |- when input -|- -|
17 * |---------------------------------------------------------|
/**
 * In-place horizontal FIR filter, scalar reference implementation.
 *
 * For every one of @height rows (rows are @stride bytes apart):
 *   1. source content is read starting at @dst;
 *   2. output (@filter_width + @width floats) is written starting at
 *      @dst - @filter_width, i.e. shifted left by the filter width.
 * The margin left of @dst need not be initialized on input; the source is
 * treated as zero-padded on both sides.
 *
 * @dst           pointer to the first source float of the first row
 * @width         source floats per row
 * @height        row count
 * @stride        row pitch in BYTES; must hold both output and margin,
 *                hence stride >= 4*(width+filter_width)
 * @filter        filter taps
 * @filter_width  tap count
 */
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    assert( stride >= 4*(width+filter_width) );

    // When the row is narrower than the filter, the left-margin phase and the
    // right-truncated phase overlap by xx_fix elements.
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    unsigned char *end = reinterpret_cast<unsigned char*>(dst) + height*stride;
    unsigned char *dst_byte = reinterpret_cast<unsigned char*>(dst);
    for ( ; dst_byte < end; dst_byte += stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;        // first output element
        float *dst_endr = dst_f + width;           // one past the last output
        float *dst_end0 = dst_endr - filter_width; // end of the steady-state phase
        float *dst_endl = dst_f - xx_fix;          // end of the left-margin phase
        assert(xx_fix == 0 || dst_end0 == dst_endl);

        // filter_start/filter_end delimit the taps that overlap the source;
        // they slide as the output position moves across the margins.
        assert(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (; dst2 < dst_endl; dst2++, filter_start--) // left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for (const float *f = filter_start; f < filter_end; f++, src++)
            {
                sum += *src * *f;
            }
            *dst2 = sum;
        }
        for (; dst2 < dst_f; dst2++, filter_start--, filter_end--) // if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for (const float *f = filter_start; f < filter_end; f++, src++)
            {
                sum += *src * *f;
            }
            *dst2 = sum;
        }
        assert(filter_start == filter);
        for (; dst2 < dst_end0; dst2++) // steady state: full filter overlap
        {
            const float *src = dst2;
            float sum = 0;
            for (const float *f = filter_start; f < filter_end; f++, src++)
            {
                sum += *src * *f;
            }
            *dst2 = sum;
        }
        for (; dst2 < dst_endr; dst2++, filter_end--) // right margin
        {
            const float *src = dst2;
            float sum = 0;
            for (const float *f = filter; f < filter_end; f++, src++)
            {
                sum += *src * *f;
            }
            *dst2 = sum;
        }
    }
}
/**
 * One 4-tap * 4-output step of the sliding-window filter.
 * Kept as a macro: an inline function sometimes generates stupid code.
 *
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4 : const
 * @sum  : in/out accumulator; sum[j] += Sum_t src[j+t]*f4[t] where src is the
 *         8-float window formed by (src4, src_5_8)
 * @src4 : clobbered (left holding the window shifted by one element)
 *
 * Declares locals f4_1 and src_3_6 — invoke inside its own { } scope.
 */
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
113 void xy_filter_sse(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
115 ASSERT( stride
>=4*(width
+filter_width
) );
116 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
118 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
119 const float *filter_start
= filter
;
120 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
121 BYTE
* end
= dst_byte
+ height
*stride
;
122 for( ; dst_byte
<end
; dst_byte
+=stride
)
124 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
125 float *dst2
= dst_f
- filter_width
;
126 float *dst_endr
= dst_f
+ width
;
127 float *dst_end0
= dst_endr
- filter_width
;
128 float *dst_endl
= dst_f
- xx_fix
;
129 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
131 ASSERT(filter_start
== filter
);
132 filter_start
+= filter_width
;
133 const float *filter_end
= filter_start
;
135 for (;dst2
<dst_endl
;dst2
+=4)//left margin
137 const float *src
= dst_f
;
141 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
142 __m128 sum
= _mm_setzero_ps();
143 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
145 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
146 __m128 f4
= _mm_load_ps(f
);
148 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
153 _mm_store_ps(dst2
, sum
);
155 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
157 const float *src
= dst_f
;
161 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
162 __m128 sum
= _mm_setzero_ps();
164 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
166 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
169 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
172 src_5_8
= _mm_setzero_ps();
173 f4
= _mm_load_ps(filter_end
);
174 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
176 _mm_store_ps(dst2
, sum
);
178 ASSERT(filter_start
== filter
);
179 for (;dst2
<dst_end0
;dst2
+=4)
181 const float *src
= dst2
;
184 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
185 __m128 sum
= _mm_setzero_ps();
186 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
189 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
190 __m128 f4
= _mm_load_ps(f
);
192 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
196 _mm_store_ps(dst2
, sum
);
198 for (;dst2
<dst_endr
;dst2
+=4)//right margin
200 const float *src
= dst2
;
204 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
205 __m128 sum
= _mm_setzero_ps();
207 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
210 src_5_8
= _mm_load_ps(src
);//5 6 7 8
213 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
216 //move new 4 in_n_out to old 4 in_n_out
218 src_5_8
= _mm_setzero_ps();
219 f4
= _mm_load_ps(filter_end
);
220 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
222 _mm_store_ps(dst2
, sum
);
228 * Copy and convert src to dst line by line.
229 * @dst_width MUST >= @width
230 * if @dst_width>@width, the extra elements will be filled with 0.
232 void xy_byte_2_float_c(float *dst
, int dst_width
, int dst_stride
,
233 PCUINT8 src
, int width
, int height
, int stride
)
235 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
237 PCUINT8 src_end
= src
+ height
*stride
;
238 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
241 PCUINT8 src_end
= src2
+ width
;
242 float *dst2
= reinterpret_cast<float*>(dst_byte
);
244 for (;src2
<src_end
;src2
++, dst2
++)
248 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
249 for (;dst2
<dst2_end
;dst2
++)
257 * See @xy_byte_2_float_c
259 void xy_byte_2_float_sse(float *dst
, int dst_width
, int dst_stride
,
260 PCUINT8 src
, int width
, int height
, int stride
)
262 ASSERT( dst_width
>=width
);
263 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
264 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
266 PCUINT8 src_end
= src
+ height
*stride
;
267 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
270 PCUINT8 src2_end0
= src2
+ (width
&~15);
271 float *dst2
= reinterpret_cast<float*>(dst_byte
);
273 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
276 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
277 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
278 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
279 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
280 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
281 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
282 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
283 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
284 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
285 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
286 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
287 _mm_store_ps(dst2
, dst_f1
);
288 _mm_store_ps(dst2
+4, dst_f2
);
289 _mm_store_ps(dst2
+8, dst_f3
);
290 _mm_store_ps(dst2
+12, dst_f4
);
292 PCUINT8 src2_end
= src
+ width
;
293 for (;src2
<src2_end
;src2
++,dst2
++)
297 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
298 for (;dst2
<dst2_end
;dst2
++)
306 * Copy transposed Matrix src to dst.
307 * @dst_width MUST >= @height.
308 * if @dst_width > @height, the extra elements will be filled with 0.
/**
 * Copy the transposed matrix @src (width x height, row pitch @src_stride
 * bytes) into @dst (row pitch @dst_stride bytes).
 * @dst_width MUST >= @height; if @dst_width > @height, the extra elements of
 * each output row are filled with 0.
 */
void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    assert(dst_width >= height);
    unsigned char *dst_byte = reinterpret_cast<unsigned char*>(dst);
    const float *src_end = src + width;
    // end of the source column walk; valid for every column because
    // width*4 <= src_stride keeps the column offset below one row pitch
    const unsigned char *src2_end = reinterpret_cast<const unsigned char*>(src) + height*src_stride;
    for ( ; src < src_end; src++, dst_byte += dst_stride )
    {
        const unsigned char *src2 = reinterpret_cast<const unsigned char*>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (; src2 < src2_end; src2 += src_stride, dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        // zero-fill the [height, dst_width) tail of the output row
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (; dst2 < dst2_end; dst2++)
        {
            *dst2 = 0;
        }
    }
}
335 * see @xy_float_2_float_transpose_c
/**
 * See @xy_float_2_float_transpose_c. SSE version.
 * Extra requirements: @dst 16-byte aligned, @dst_stride a multiple of 16.
 */
void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    // uintptr_t, not int: casting a pointer to int truncates on 64-bit targets
    assert( ((reinterpret_cast<uintptr_t>(dst)|dst_stride) & 15) == 0 );
    assert(dst_width >= height);
    unsigned char *dst_byte = reinterpret_cast<unsigned char*>(dst);
    SrcT *src_end = src + width;
    // bulk end (height rounded down to a multiple of 4) and true end
    const unsigned char *src2_end1 = reinterpret_cast<const unsigned char*>(src) + (height&~3)*src_stride;
    const unsigned char *src2_end2 = reinterpret_cast<const unsigned char*>(src) + height*src_stride;
    for ( ; src < src_end; src++, dst_byte += dst_stride )
    {
        const unsigned char *src2 = reinterpret_cast<const unsigned char*>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        // gather 4 column elements at a time
        for (; src2 < src2_end1; src2 += 4*src_stride, dst2 += 4)
        {
            __m128 m1 = _mm_set_ps(
                *reinterpret_cast<SrcT*>(src2+3*src_stride),
                *reinterpret_cast<SrcT*>(src2+2*src_stride),
                *reinterpret_cast<SrcT*>(src2+src_stride),
                *reinterpret_cast<SrcT*>(src2));
            _mm_store_ps(dst2, m1);
        }
        // scalar tail for the remaining (height % 4) elements
        for (; src2 < src2_end2; src2 += src_stride, dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        // zero-fill the [height, dst_width) tail of the output row
        float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (; dst2 < dst2_end; dst2++)
        {
            *dst2 = 0;
        }
    }
}
376 * Transpose and round Matrix src, then copy to dst.
377 * @dst_width MUST >= @height.
378 * if @dst_width > @height, the extra elements will be filled with 0.
380 void xy_float_2_byte_transpose_c(UINT8
*dst
, int dst_width
, int dst_stride
,
381 const float *src
, int width
, int height
, int src_stride
)
383 ASSERT(dst_width
>= height
);
384 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
385 const float* src_end
= src
+ width
;
386 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
387 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
389 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
391 UINT8
*dst2
= reinterpret_cast<UINT8
*>(dst_byte
);
392 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
394 *dst2
= static_cast<UINT8
>(*reinterpret_cast<const float*>(src2
)+0.5);
396 UINT8
*dst2_end
= reinterpret_cast<UINT8
*>(dst_byte
) + dst_width
;
397 for (;dst2
<dst2_end
;dst2
++)
404 void xy_float_2_byte_transpose_sse(UINT8
*dst
, int dst_width
, int dst_stride
,
405 const float *src
, int width
, int height
, int src_stride
)
408 typedef const float SrcT
;
410 ASSERT(dst_width
>= height
);
411 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
412 SrcT
* src_end
= src
+ width
;
413 PCUINT8 src2_end00
= reinterpret_cast<PCUINT8
>(src
) + (height
&~15)*src_stride
;
414 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
415 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
417 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
419 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
420 for (;src2
<src2_end00
;src2
+=16*src_stride
,dst2
+=16)
422 __m128 m1
= _mm_set_ps(
423 *(SrcT
*)(src2
+3*src_stride
),
424 *(SrcT
*)(src2
+2*src_stride
),
425 *(SrcT
*)(src2
+src_stride
),
427 __m128 m2
= _mm_set_ps(
428 *(SrcT
*)(src2
+7*src_stride
),
429 *(SrcT
*)(src2
+6*src_stride
),
430 *(SrcT
*)(src2
+5*src_stride
),
431 *(SrcT
*)(src2
+4*src_stride
));
432 __m128 m3
= _mm_set_ps(
433 *(SrcT
*)(src2
+11*src_stride
),
434 *(SrcT
*)(src2
+10*src_stride
),
435 *(SrcT
*)(src2
+9*src_stride
),
436 *(SrcT
*)(src2
+8*src_stride
));
437 __m128 m4
= _mm_set_ps(
438 *(SrcT
*)(src2
+15*src_stride
),
439 *(SrcT
*)(src2
+14*src_stride
),
440 *(SrcT
*)(src2
+13*src_stride
),
441 *(SrcT
*)(src2
+12*src_stride
));
443 __m128i i1
= _mm_cvtps_epi32(m1
);
444 __m128i i2
= _mm_cvtps_epi32(m2
);
445 __m128i i3
= _mm_cvtps_epi32(m3
);
446 __m128i i4
= _mm_cvtps_epi32(m4
);
448 i1
= _mm_packs_epi32(i1
,i2
);
449 i3
= _mm_packs_epi32(i3
,i4
);
450 i1
= _mm_packus_epi16(i1
,i3
);
452 _mm_store_si128((__m128i
*)dst2
, i1
);
454 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
456 *dst2
= static_cast<DstT
>(*reinterpret_cast<SrcT
*>(src2
)+0.5);
458 DstT
*dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
459 for (;dst2
<dst2_end
;dst2
++)
467 * To Do: decent CPU capability check
469 void xy_gaussian_blur(PUINT8 dst
, int dst_stride
,
470 PCUINT8 src
, int width
, int height
, int stride
,
471 const float *gt
, int r
, int gt_ex_width
)
473 ASSERT(width
<=stride
&& width
+2*r
<=dst_stride
);
474 int ex_mask_width
= ((r
*2+1)+3)&~3;
475 ASSERT(ex_mask_width
<=gt_ex_width
);
476 if (ex_mask_width
>gt_ex_width
)
483 int fwidth
= (width
+3)&~3;
484 int fstride
= (fwidth
+ ex_mask_width
)*sizeof(float);
485 int fheight
= (height
+3)&~3;
486 int fstride_ver
= (fheight
+ex_mask_width
)*sizeof(float);
488 PUINT8 buff_base
= reinterpret_cast<PUINT8
>(xy_malloc(height
*fstride
+ (fwidth
+ ex_mask_width
)*fstride_ver
));
490 float *hor_buff_base
= reinterpret_cast<float*>(buff_base
);
491 float *hor_buff
= hor_buff_base
+ ex_mask_width
;
494 ASSERT( ((width
+15)&~15)<=stride
);
495 xy_byte_2_float_sse(hor_buff
, fwidth
, fstride
, src
, width
, height
, stride
);
498 xy_filter_sse(hor_buff
, fwidth
, height
, fstride
, gt
, ex_mask_width
);
502 float *ver_buff_base
= reinterpret_cast<float*>(buff_base
+ height
*fstride
);
503 float *ver_buff
= ver_buff_base
+ ex_mask_width
;
505 int true_width
= width
+r
*2;
506 xy_float_2_float_transpose_sse(ver_buff
, fheight
, fstride_ver
, hor_buff
-r
*2, true_width
, height
, fstride
);
509 xy_filter_sse(ver_buff
, fheight
, true_width
, fstride_ver
, gt
, ex_mask_width
);
512 int true_height
= height
+ 2*r
;
513 xy_float_2_byte_transpose_sse(dst
, true_width
, dst_stride
, ver_buff
-r
*2, true_height
, true_width
, fstride_ver
);