1 #ifndef __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__
2 #define __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__
4 #include <gtest/gtest.h>
#include <cstdlib>  // abs
#include <cstring>  // memcpy
10 typedef const UINT8 CUINT8
, *PCUINT8
;
12 //////////////////////////////////////////////////////////////////////
16 void xy_filter_c_v0(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
18 ASSERT( stride
>=4*(width
+filter_width
) );
20 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
21 const float *filter_start
= filter
;
23 BYTE
* end
= reinterpret_cast<BYTE
*>(dst
) + height
*stride
;
24 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
25 for( ; dst_byte
<end
; dst_byte
+=stride
)
27 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
28 float *dst2
= dst_f
- filter_width
;
29 float *dst_endr
= dst_f
+ width
;
30 float *dst_end0
= dst_endr
- filter_width
;
31 float *dst_endl
= dst_f
- xx_fix
;
32 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
34 ASSERT(filter_start
== filter
);
35 filter_start
+= filter_width
;
36 const float *filter_end
= filter_start
;
37 for (;dst2
<dst_endl
;dst2
++, filter_start
--)//left margin
39 const float *src
= dst_f
;
41 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
47 for (;dst2
<dst_f
;dst2
++, filter_start
--, filter_end
--)//if width < filter_width
49 const float *src
= dst_f
;
51 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
57 ASSERT(filter_start
==filter
);
58 for (;dst2
<dst_end0
;dst2
++)
60 const float *src
= dst2
;
63 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
69 for (;dst2
<dst_endr
;dst2
++, filter_end
--)//right margin
71 const float *src
= dst2
;
73 for(const float* f
=filter
;f
<filter_end
;f
++, src
++)
82 void xy_filter_sse_v0(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
87 ASSERT( stride
>=4*(width
+filter_width
) );
88 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
90 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
91 const float *filter_start
= filter
;
92 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
93 BYTE
* end
= dst_byte
+ height
*stride
;
94 for( ; dst_byte
<end
; dst_byte
+=stride
)
96 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
97 float *dst2
= dst_f
- filter_width
;
98 float *dst_endr
= dst_f
+ width
;
99 float *dst_end0
= dst_endr
- filter_width
;
100 float *dst_endl
= dst_f
- xx_fix
;
101 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
103 ASSERT(filter_start
== filter
);
104 filter_start
+= filter_width
;
105 const float *filter_end
= filter_start
;
107 for (;dst2
<dst_endl
;dst2
+=4)//left margin
109 const float *src
= dst_f
;
113 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
114 __m128 sum
= _mm_setzero_ps();
115 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
117 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
118 __m128 f4
= _mm_load_ps(f
);
120 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
121 __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
122 f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
123 f4_1 = _mm_mul_ps(f4_1, src4);\
124 sum = _mm_add_ps(sum, f4_1);\
125 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
126 f4_1 = _mm_unpackhi_ps(f4,f4);\
127 f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
128 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
129 sum = _mm_add_ps(sum, f4_1);\
130 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
131 f4_1 = _mm_unpacklo_ps(f4,f4);\
132 f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
133 f4_1 = _mm_mul_ps(f4_1, src4);\
134 sum = _mm_add_ps(sum, f4_1);\
135 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
136 f4_1 = _mm_unpackhi_ps(f4,f4);\
137 f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
138 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
139 sum = _mm_add_ps(sum, f4_1)
141 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
145 _mm_stream_ps(dst2
, sum
);
147 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
149 const float *src
= dst_f
;
153 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
154 __m128 sum
= _mm_setzero_ps();
156 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
158 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
161 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
164 src_5_8
= _mm_setzero_ps();
165 f4
= _mm_load_ps(filter_end
);
166 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
168 _mm_stream_ps(dst2
, sum
);
170 ASSERT(filter_start
== filter
);
171 for (;dst2
<dst_end0
;dst2
+=4)
173 const float *src
= dst2
;
176 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
177 __m128 sum
= _mm_setzero_ps();
178 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
181 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
182 __m128 f4
= _mm_load_ps(f
);
184 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
188 _mm_stream_ps(dst2
, sum
);
190 for (;dst2
<dst_endr
;dst2
+=4)//right margin
192 const float *src
= dst2
;
196 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
197 __m128 sum
= _mm_setzero_ps();
199 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
202 src_5_8
= _mm_load_ps(src
);//5 6 7 8
205 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
208 //move new 4 in_n_out to old 4 in_n_out
210 src_5_8
= _mm_setzero_ps();
211 f4
= _mm_load_ps(filter_end
);
212 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
214 _mm_stream_ps(dst2
, sum
);
220 void xy_filter_sse_v1(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
225 ASSERT( stride
>=4*(width
+filter_width
) );
226 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
228 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
229 const float *filter_start
= filter
;
230 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
231 BYTE
* end
= dst_byte
+ height
*stride
;
232 for( ; dst_byte
<end
; dst_byte
+=stride
)
234 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
235 float *dst2
= dst_f
- filter_width
;
236 float *dst_endr
= dst_f
+ width
;
237 float *dst_end0
= dst_endr
- filter_width
;
238 float *dst_endl
= dst_f
- xx_fix
;
239 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
241 ASSERT(filter_start
== filter
);
242 filter_start
+= filter_width
;
243 const float *filter_end
= filter_start
;
245 for (;dst2
<dst_endl
;dst2
+=4)//left margin
247 const float *src
= dst_f
;
251 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
252 __m128 sum
= _mm_setzero_ps();
253 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
255 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
256 __m128 f4
= _mm_load_ps(f
);
258 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
259 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
260 f4_1 = _mm_mul_ps(f4_1, src4);\
261 sum = _mm_add_ps(sum, f4_1);\
262 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
263 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
264 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
265 sum = _mm_add_ps(sum, f4_1);\
266 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
267 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
268 f4_1 = _mm_mul_ps(f4_1, src4);\
269 sum = _mm_add_ps(sum, f4_1);\
270 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
271 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
272 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
273 sum = _mm_add_ps(sum, f4_1)
275 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
279 _mm_stream_ps(dst2
, sum
);
281 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
283 const float *src
= dst_f
;
287 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
288 __m128 sum
= _mm_setzero_ps();
290 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
292 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
295 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
298 src_5_8
= _mm_setzero_ps();
299 f4
= _mm_load_ps(filter_end
);
300 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
302 _mm_stream_ps(dst2
, sum
);
304 ASSERT(filter_start
== filter
);
305 for (;dst2
<dst_end0
;dst2
+=4)
307 const float *src
= dst2
;
310 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
311 __m128 sum
= _mm_setzero_ps();
312 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
315 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
316 __m128 f4
= _mm_load_ps(f
);
318 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
322 _mm_stream_ps(dst2
, sum
);
324 for (;dst2
<dst_endr
;dst2
+=4)//right margin
326 const float *src
= dst2
;
330 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
331 __m128 sum
= _mm_setzero_ps();
333 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
336 src_5_8
= _mm_load_ps(src
);//5 6 7 8
339 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
342 //move new 4 in_n_out to old 4 in_n_out
344 src_5_8
= _mm_setzero_ps();
345 f4
= _mm_load_ps(filter_end
);
346 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
348 _mm_stream_ps(dst2
, sum
);
354 void xy_filter_sse_v2(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
359 ASSERT( stride
>=4*(width
+filter_width
) );
360 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
362 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
363 const float *filter_start
= filter
;
364 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
365 BYTE
* end
= dst_byte
+ height
*stride
;
366 for( ; dst_byte
<end
; dst_byte
+=stride
)
368 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
369 float *dst2
= dst_f
- filter_width
;
370 float *dst_endr
= dst_f
+ width
;
371 float *dst_end0
= dst_endr
- filter_width
;
372 float *dst_endl
= dst_f
- xx_fix
;
373 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
375 ASSERT(filter_start
== filter
);
376 filter_start
+= filter_width
;
377 const float *filter_end
= filter_start
;
379 for (;dst2
<dst_endl
;dst2
+=4)//left margin
381 const float *src
= dst_f
;
385 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
386 __m128 sum
= _mm_setzero_ps();
387 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
389 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
390 __m128 f4
= _mm_load_ps(f
);
392 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
393 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
394 f4_1 = _mm_mul_ps(f4_1, src4);\
395 sum = _mm_add_ps(sum, f4_1);\
396 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
397 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
398 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
399 sum = _mm_add_ps(sum, f4_1);\
400 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
401 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
402 f4_1 = _mm_mul_ps(f4_1, src4);\
403 sum = _mm_add_ps(sum, f4_1);\
404 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
405 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
406 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
407 sum = _mm_add_ps(sum, f4_1)
409 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
413 _mm_store_ps(dst2
, sum
);
415 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
417 const float *src
= dst_f
;
421 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
422 __m128 sum
= _mm_setzero_ps();
424 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
426 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
429 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
432 src_5_8
= _mm_setzero_ps();
433 f4
= _mm_load_ps(filter_end
);
434 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
436 _mm_store_ps(dst2
, sum
);
438 ASSERT(filter_start
== filter
);
439 for (;dst2
<dst_end0
;dst2
+=4)
441 const float *src
= dst2
;
444 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
445 __m128 sum
= _mm_setzero_ps();
446 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
449 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
450 __m128 f4
= _mm_load_ps(f
);
452 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
456 _mm_store_ps(dst2
, sum
);
458 for (;dst2
<dst_endr
;dst2
+=4)//right margin
460 const float *src
= dst2
;
464 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
465 __m128 sum
= _mm_setzero_ps();
467 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
470 src_5_8
= _mm_load_ps(src
);//5 6 7 8
473 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
476 //move new 4 in_n_out to old 4 in_n_out
478 src_5_8
= _mm_setzero_ps();
479 f4
= _mm_load_ps(filter_end
);
480 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
482 _mm_store_ps(dst2
, sum
);
// Accumulate a 4-tap FIR step into @sum using SSE shuffles.
// Given 8 consecutive samples split across @src4 = {x0 x1 x2 x3} and
// @src_5_8 = {x4 x5 x6 x7}, and 4 filter taps @f4 = {f0 f1 f2 f3}, this adds
//   sum[i] += f0*x[i] + f1*x[i+1] + f2*x[i+2] + f3*x[i+3]   for i = 0..3,
// i.e. one 4-wide slice of a sliding-window convolution.
// NOTE: @src4 is passed by non-const reference and is clobbered (it is reused
// as shuffle scratch); @src_5_8 and @f4 are left untouched.
// NOTE(review): gaps in the embedded original line numbering show the
// surrounding braces are missing from this extraction.
488 static __forceinline
void xy_filter_4_inline(__m128
& src4
, const __m128
& src_5_8
, const __m128
& f4
, __m128
& sum
)
// sum += f0 * {x0 x1 x2 x3}
490 __m128 f4_1
= _mm_shuffle_ps(f4
, f4
, _MM_SHUFFLE(0,0,0,0));
491 f4_1
= _mm_mul_ps(f4_1
, src4
);
492 sum
= _mm_add_ps(sum
, f4_1
);
// Build window shifted by 2, then sum += f2 * {x2 x3 x4 x5}
493 __m128 src_3_6
= _mm_shuffle_ps(src4
, src_5_8
, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/
494 f4_1
= _mm_shuffle_ps(f4
, f4
, _MM_SHUFFLE(2,2,2,2));
495 f4_1
= _mm_mul_ps(f4_1
, src_3_6
);
496 sum
= _mm_add_ps(sum
, f4_1
);
// Build window shifted by 1 (clobbers src4), then sum += f1 * {x1 x2 x3 x4}
497 src4
= _mm_shuffle_ps(src4
, src_3_6
, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/
498 f4_1
= _mm_shuffle_ps(f4
, f4
, _MM_SHUFFLE(1,1,1,1));
499 f4_1
= _mm_mul_ps(f4_1
, src4
);
500 sum
= _mm_add_ps(sum
, f4_1
);
// Build window shifted by 3, then sum += f3 * {x3 x4 x5 x6}
501 src_3_6
= _mm_shuffle_ps(src_3_6
, src_5_8
, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/
502 f4_1
= _mm_shuffle_ps(f4
, f4
, _MM_SHUFFLE(3,3,3,3));
503 f4_1
= _mm_mul_ps(f4_1
, src_3_6
);
504 sum
= _mm_add_ps(sum
, f4_1
);
507 void xy_filter_sse_v3(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
509 ASSERT( stride
>=4*(width
+filter_width
) );
510 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
512 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
513 const float *filter_start
= filter
;
514 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
515 BYTE
* end
= dst_byte
+ height
*stride
;
516 for( ; dst_byte
<end
; dst_byte
+=stride
)
518 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
519 float *dst2
= dst_f
- filter_width
;
520 float *dst_endr
= dst_f
+ width
;
521 float *dst_end0
= dst_endr
- filter_width
;
522 float *dst_endl
= dst_f
- xx_fix
;
523 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
525 ASSERT(filter_start
== filter
);
526 filter_start
+= filter_width
;
527 const float *filter_end
= filter_start
;
529 for (;dst2
<dst_endl
;dst2
+=4)//left margin
531 const float *src
= dst_f
;
535 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
536 __m128 sum
= _mm_setzero_ps();
537 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
539 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
540 __m128 f4
= _mm_load_ps(f
);
542 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
547 _mm_store_ps(dst2
, sum
);
549 for (;dst2
<dst_f
;dst2
+=4)//if width < filter_width
551 const float *src
= dst_f
;
555 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
556 __m128 sum
= _mm_setzero_ps();
558 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
560 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
563 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
566 src_5_8
= _mm_setzero_ps();
567 f4
= _mm_load_ps(filter_end
);
568 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
570 _mm_store_ps(dst2
, sum
);
572 ASSERT(filter_start
== filter
);
573 for (;dst2
<dst_end0
;dst2
+=4)
575 const float *src
= dst2
;
578 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
579 __m128 sum
= _mm_setzero_ps();
580 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
583 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
584 __m128 f4
= _mm_load_ps(f
);
586 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
590 _mm_store_ps(dst2
, sum
);
592 for (;dst2
<dst_endr
;dst2
+=4)//right margin
594 const float *src
= dst2
;
598 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
599 __m128 sum
= _mm_setzero_ps();
601 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
604 src_5_8
= _mm_load_ps(src
);//5 6 7 8
607 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
610 //move new 4 in_n_out to old 4 in_n_out
612 src_5_8
= _mm_setzero_ps();
613 f4
= _mm_load_ps(filter_end
);
614 xy_filter_4_inline(src4
, src_5_8
, f4
, sum
);
616 _mm_store_ps(dst2
, sum
);
622 * inline function sometimes generates stupid code
624 * @src4, @src_5_8, @f4, @sum : __m128
625 * @src_5_8, @f4: const
629 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
630 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
631 f4_1 = _mm_mul_ps(f4_1, src4);\
632 sum = _mm_add_ps(sum, f4_1);\
633 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
634 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
635 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
636 sum = _mm_add_ps(sum, f4_1);\
637 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
638 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
639 f4_1 = _mm_mul_ps(f4_1, src4);\
640 sum = _mm_add_ps(sum, f4_1);\
641 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
642 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
643 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
644 sum = _mm_add_ps(sum, f4_1)
647 __forceinline
void xy_filter_one_line_sse_v4(float *dst
, int width
, const float *filter
, int filter_width
)
649 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
650 const float *filter_start
= filter
;
651 float *dst2
= dst
- filter_width
;
652 float *dst_endr
= dst
+ width
;
653 float *dst_end0
= dst_endr
- filter_width
;
654 float *dst_endl
= dst
- xx_fix
;
655 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
657 ASSERT(filter_start
== filter
);
658 filter_start
+= filter_width
;
659 const float *filter_end
= filter_start
;
661 for (;dst2
<dst_endl
;dst2
+=4)//left margin
663 const float *src
= dst
;
667 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
668 __m128 sum
= _mm_setzero_ps();
669 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
671 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
672 __m128 f4
= _mm_load_ps(f
);
674 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
679 _mm_store_ps(dst2
, sum
);
681 for (;dst2
<dst
;dst2
+=4)//if width < filter_width
683 const float *src
= dst
;
687 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
688 __m128 sum
= _mm_setzero_ps();
690 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
692 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
695 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
698 src_5_8
= _mm_setzero_ps();
699 f4
= _mm_load_ps(filter_end
);
700 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
702 _mm_store_ps(dst2
, sum
);
704 ASSERT(filter_start
== filter
);
705 for (;dst2
<dst_end0
;dst2
+=4)
707 const float *src
= dst2
;
710 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
711 __m128 sum
= _mm_setzero_ps();
712 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
715 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
716 __m128 f4
= _mm_load_ps(f
);
718 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
722 _mm_store_ps(dst2
, sum
);
724 for (;dst2
<dst_endr
;dst2
+=4)//right margin
726 const float *src
= dst2
;
730 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
731 __m128 sum
= _mm_setzero_ps();
733 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
736 src_5_8
= _mm_load_ps(src
);//5 6 7 8
739 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
742 //move new 4 in_n_out to old 4 in_n_out
744 src_5_8
= _mm_setzero_ps();
745 f4
= _mm_load_ps(filter_end
);
746 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
748 _mm_store_ps(dst2
, sum
);
// Horizontal FIR filter over a 2-D float buffer, SSE version 4.
// Applies a filter_width-tap convolution to each of @height rows of @dst
// (rows are @stride bytes apart), delegating the per-row work to
// xy_filter_one_line_sse_v4. Operates in place on dst.
// Preconditions (checked below): the row must have filter_width floats of
// writable margin, and dst/filter/stride/widths must all be 16-byte aligned.
// NOTE(review): reinterpret_cast<int>(pointer) truncates on 64-bit targets;
// an intptr_t cast would be needed for a portable alignment check.
752 void xy_filter_sse_v4(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
// Row length (plus margin) must fit inside the stride.
754 ASSERT( stride
>=4*(width
+filter_width
) );
// 16-byte alignment of every quantity the SSE loads/stores rely on.
755 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
// Iterate rows by raw byte pointer so the stride (in bytes) applies directly.
757 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
758 BYTE
* end
= dst_byte
+ height
*stride
;
759 for( ; dst_byte
<end
; dst_byte
+=stride
)
// One row per iteration; the helper handles left/right margins itself.
761 xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
770 M128s
<LENGTH
- 4> next
;
772 template<int Index
> __forceinline __m128
& GetAt()
774 return next
.GetAt
<Index
- 4>();
776 template<> __forceinline __m128
& GetAt
<0>()
781 template<int Start
, int Offset
> __forceinline __m128
& GetAt()
783 return GetAt
<Start
+ Offset
>();
786 __forceinline
void Load(const float* src
)
788 x
= _mm_load_ps(src
);
796 void Load(const float* src
)
801 template<int FILTER_LENGTH
, int START
, int LENGTH
>
804 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
806 Filter4
<FILTER_LENGTH
,START
,LENGTH
-4>::do_cal(src0_128
, src4
, filter128s
, sum
);
807 __m128 src4_128
= _mm_load_ps(src4
+LENGTH
-4);
808 XY_FILTER_4(src0_128
, src4_128
, filter128s
.GetAt
<START
+LENGTH
-4>(), sum
);
813 template<int FILTER_LENGTH
, int START
>
814 struct Filter4
<FILTER_LENGTH
, START
, 0>
816 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
821 template<int FILTER_LENGTH
,int MARGIN_LENGTH
>
822 struct FilterAllMargin
824 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
827 __m128 src0
= _mm_setzero_ps();
828 __m128 sum
= _mm_setzero_ps();
829 Filter4
<FILTER_LENGTH
,MARGIN_LENGTH
-4,FILTER_LENGTH
-MARGIN_LENGTH
+4>::do_cal(src0
, src
, filter128s
, sum
);
830 _mm_store_ps(src
-MARGIN_LENGTH
, sum
);
831 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_left_margin(src
, filter128s
);
834 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
838 __m128 src0
= _mm_load_ps(src
);
839 __m128 sum
= _mm_setzero_ps();
840 Filter4
<FILTER_LENGTH
,0,MARGIN_LENGTH
-4>::do_cal(src0
, src
+4, filter128s
, sum
);
841 __m128 src4
= _mm_setzero_ps();
842 XY_FILTER_4(src0
, src4
, filter128s
.GetAt
<MARGIN_LENGTH
-4>(), sum
);
844 _mm_store_ps(src
, sum
);
846 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_right_margin(src
+4, filter128s
);
850 template<int FILTER_LENGTH
>
851 struct FilterAllMargin
<FILTER_LENGTH
,0>
853 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
856 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
864 * FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
866 template<int FILTER_LENGTH
>
867 void xy_filter_sse_template_v0(float *dst
, int width
, int height
, int stride
, const float *filter
)
869 ASSERT( stride
>=4*(width
+FILTER_LENGTH
) );
870 ASSERT( ((stride
|(4*width
)|(4*FILTER_LENGTH
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
872 M128s
<FILTER_LENGTH
> filter128s
;
873 filter128s
.Load(filter
);
875 const float *filter_start
= filter
;
876 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
877 BYTE
* end
= dst_byte
+ height
*stride
;
878 for( ; dst_byte
<end
; dst_byte
+=stride
)
880 float *dst2
= reinterpret_cast<float*>(dst_byte
);
883 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_left_margin(dst2
, filter128s
);
884 float *dst_end1
= dst2
+ width
;
885 float *dst_end0
= dst_end1
- FILTER_LENGTH
;
886 for (;dst2
<dst_end0
;dst2
+=4)
888 const float *src
= dst2
;
891 __m128 src0
= _mm_load_ps(src
);/*1 2 3 4*/
893 __m128 sum
= _mm_setzero_ps();
894 Filter4
<FILTER_LENGTH
,0,FILTER_LENGTH
>::do_cal(src0
, src
, filter128s
, sum
);
896 _mm_store_ps(dst2
, sum
);
898 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_right_margin(dst2
, filter128s
);
// Horizontal FIR filter, SSE version 5: dispatches to a fully-unrolled
// template instantiation of xy_filter_sse_template_v0 when filter_width is a
// small multiple of 4 (4..28) that fits inside the row, and otherwise falls
// back to the generic loop-based xy_filter_sse_v4.
902 void xy_filter_sse_v5(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
// The templated filters bake filter_width in as a compile-time constant,
// so their signature omits the filter_width parameter.
904 typedef void (*Filter
)(float *dst
, int width
, int height
, int stride
, const float *filter
);
// Dispatch table indexed by (filter_width-1)/4: entry 0 -> width 4, ...,
// entry 6 -> width 28.
905 const Filter filters
[] = { xy_filter_sse_template_v0
<4>, xy_filter_sse_template_v0
<8>, xy_filter_sse_template_v0
<12>, xy_filter_sse_template_v0
<16>,
906 xy_filter_sse_template_v0
<20>, xy_filter_sse_template_v0
<24>, xy_filter_sse_template_v0
<28>
// Template path requires FILTER_LENGTH<=width (see template's doc comment).
908 if (filter_width
<=28 && filter_width
<=width
)
910 ASSERT(filter_width
%4==0);
// With filter_width a positive multiple of 4, (filter_width-1)/4 maps
// 4->0, 8->1, ..., 28->6 -- always in range of the 7-entry table.
911 filters
[(filter_width
-1)/4](dst
, width
, height
, stride
, filter
);
// Fallback: generic run-time filter width.
915 xy_filter_sse_v4(dst
, width
, height
, stride
, filter
, filter_width
);
921 __forceinline
void xy_filter_one_line_sse_v6(float *dst
, int width
, const float *filter
, int filter_width
)
923 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
924 const float *filter_start
= filter
;
925 float *dst2
= dst
- filter_width
;
926 float *dst_endr
= dst
+ width
;
927 float *dst_end0
= dst_endr
- filter_width
- 4;
928 float *dst_endl
= dst
- xx_fix
;
929 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
-4);
931 ASSERT(filter_start
== filter
);
932 filter_start
+= filter_width
;
933 const float *filter_end
= filter_start
;
935 for (;dst2
<dst_endl
;dst2
+=4)//left margin
937 const float *src
= dst
;
941 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
942 __m128 sum
= _mm_setzero_ps();
943 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
945 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
946 __m128 f4
= _mm_load_ps(f
);
948 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
953 _mm_store_ps(dst2
, sum
);
955 for (;dst2
<dst
;dst2
+=4)//if width < filter_width
957 const float *src
= dst
;
961 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
962 __m128 sum
= _mm_setzero_ps();
964 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
966 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
969 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
972 src_5_8
= _mm_setzero_ps();
973 f4
= _mm_load_ps(filter_end
);
974 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
976 _mm_store_ps(dst2
, sum
);
978 ASSERT(filter_start
== filter
);
979 for (;dst2
<dst_end0
;dst2
+=8)
981 const float *src
= dst2
;
982 const float* f
=filter_start
;
985 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
988 __m128 sum
= _mm_setzero_ps();
989 __m128 sum2
= _mm_setzero_ps();
990 __m128 f4
= _mm_load_ps(f
);
992 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
994 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
995 for(;f
<filter_end
;f
+=4,src
+=4)
997 src4
= _mm_load_ps(src
);/*1 2 3 4*/
998 __m128 tmp
= src_5_8
;//important!
999 { XY_FILTER_4(tmp
, src4
, f4
, sum2
); }
1001 f4
= _mm_load_ps(f
);
1002 { XY_FILTER_4(src_5_8
, src4
, f4
, sum
); }
1005 src4
= _mm_load_ps(src
);/*1 2 3 4*/
1006 { XY_FILTER_4(src_5_8
, src4
, f4
, sum2
); }
1009 _mm_store_ps(dst2
, sum
);
1010 _mm_store_ps(dst2
+4, sum2
);
1014 const float *src
= dst2
;
1016 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
1018 __m128 sum
= _mm_setzero_ps();
1019 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
1021 __m128 src_5_8
= _mm_load_ps(src
);//5 6 7 8
1022 __m128 f4
= _mm_load_ps(f
);
1024 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
1028 _mm_store_ps(dst2
, sum
);
1031 for (;dst2
<dst_endr
;dst2
+=4)//right margin
1033 const float *src
= dst2
;
1037 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
1038 __m128 sum
= _mm_setzero_ps();
1040 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
1043 src_5_8
= _mm_load_ps(src
);//5 6 7 8
1044 f4
= _mm_load_ps(f
);
1046 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
1049 //move new 4 in_n_out to old 4 in_n_out
1051 src_5_8
= _mm_setzero_ps();
1052 f4
= _mm_load_ps(filter_end
);
1053 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
1055 _mm_store_ps(dst2
, sum
);
// Horizontal FIR filter, SSE version 6: same row-by-row driver as v4, but
// each row is processed by xy_filter_one_line_sse_v6, which unrolls the main
// loop two 4-float groups at a time. Operates in place on dst.
// Preconditions (checked below): filter_width floats of writable margin per
// row, and 16-byte alignment of dst/filter/stride/widths.
// NOTE(review): reinterpret_cast<int>(pointer) truncates on 64-bit targets;
// an intptr_t cast would be needed for a portable alignment check.
1059 void xy_filter_sse_v6(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
1061 ASSERT( stride
>=4*(width
+filter_width
) );
1062 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
// Walk rows as raw bytes so the byte stride applies directly.
1064 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
1065 BYTE
* end
= dst_byte
+ height
*stride
;
1066 for( ; dst_byte
<end
; dst_byte
+=stride
)
1068 xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
1075 // ref: "Comparing floating point numbers" by Bruce Dawson
1076 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
1078 bool AlmostEqual(float A
, float B
, int maxUlps
=0)
1080 // Make sure maxUlps is non-negative and small enough that the
1081 // default NAN won't compare as equal to anything.
1082 ASSERT(maxUlps
> 0 && maxUlps
< 4 * 1024 * 1024);
1083 int aInt
= *(int*)&A
;
1084 // Make aInt lexicographically ordered as a twos-complement int
1086 aInt
= 0x80000000 - aInt
;
1087 // Make bInt lexicographically ordered as a twos-complement int
1088 int bInt
= *(int*)&B
;
1090 bInt
= 0x80000000 - bInt
;
1092 int intDiff
= abs(aInt
- bInt
);
1093 if (intDiff
<= maxUlps
)
1100 * @src4, @f4_1, @sum : __m128
1105 #define XY_FILTER_4_1(src4, f4_1, sum) \
1106 src4 = _mm_mul_ps(src4, f4_1);\
1107 sum = _mm_add_ps(sum, src4);
1111 * LENGTH%4 == 0 || LENGTH%4 == 1
1113 template<int LENGTH
>
1117 M128s_V1
<LENGTH
- 4> next
;
1119 template<int Index
> __forceinline __m128
& GetAt()
1121 return next
.GetAt
<Index
- 4>();
1123 template<> __forceinline __m128
& GetAt
<0>()
1128 template<int Start
, int Offset
> __forceinline __m128
& GetAt()
1130 return GetAt
<Start
+ Offset
>();
1133 __forceinline
void Load(const float* src
)
1135 x
= _mm_load_ps(src
);
1145 template<int Index
> __forceinline __m128
& GetAt()
1149 __forceinline
void Load(const float* src
)
1151 x
= _mm_set1_ps(*src
);
1158 void Load(const float* src
)
1163 template<int FILTER_LENGTH
, int START
, int LENGTH
>
1166 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s_V1
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
1168 __m128 src4_128
= _mm_load_ps(src4
);
1169 { XY_FILTER_4(src0_128
, src4_128
, filter128s
.GetAt
<START
>(), sum
); }
1170 Filter4_V1
<FILTER_LENGTH
,START
+4,LENGTH
-4>::do_cal(src4_128
, src4
+4, filter128s
, sum
);
1171 src0_128
= src4_128
;
1175 template<int FILTER_LENGTH
, int START
>
1176 struct Filter4_V1
<FILTER_LENGTH
, START
, 1>
1178 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s_V1
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
1180 cal_tail
<FILTER_LENGTH
-START
>(src0_128
, src4
, filter128s
, sum
);
1183 static __forceinline
void cal_tail(__m128
& src0_128
, const float * src4
, M128s_V1
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
1188 static __forceinline
void cal_tail
<1>(__m128
& src0_128
, const float * src4
, M128s_V1
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
1190 { XY_FILTER_4_1(src0_128
, filter128s
.GetAt
<FILTER_LENGTH
-1>(), sum
); }
1194 template<int FILTER_LENGTH
, int START
>
1195 struct Filter4_V1
<FILTER_LENGTH
, START
, 0>
1197 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s_V1
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
1202 template<int FILTER_LENGTH
,int MARGIN_LENGTH
>
1203 struct FilterAllLeftMargin_V1
1205 static __forceinline
void cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1207 do_cal
<FILTER_LENGTH
%4>(src
, filter128s
);
1209 template<int FILTER_TAIL
>
1210 static __forceinline
void do_cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1213 __m128 src0
= _mm_setzero_ps();
1214 __m128 sum
= _mm_setzero_ps();
1215 Filter4_V1
<FILTER_LENGTH
,MARGIN_LENGTH
-4,FILTER_LENGTH
-MARGIN_LENGTH
+4>::do_cal(src0
, src
, filter128s
, sum
);
1216 _mm_store_ps(src
-MARGIN_LENGTH
, sum
);
1217 FilterAllLeftMargin_V1
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::do_cal
<0>(src
, filter128s
);
1220 static __forceinline
void do_cal
<1>(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1223 __m128 sum
= _mm_setzero_ps();
1224 //Only one of the last 4 filter coefficiences is non-zero
1225 _mm_store_ps(src
-MARGIN_LENGTH
, sum
);
1226 FilterAllLeftMargin_V1
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::do_cal
<0>(src
, filter128s
);
1230 template<int FILTER_LENGTH
, int MARGIN_LENGTH
>
1231 struct FilterAllRightMargin_V1
1233 static __forceinline
void cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1235 do_cal
<FILTER_LENGTH
%4>(src
, filter128s
);
1237 template<int FILTER_TAIL
>
1238 static __forceinline
void do_cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1242 __m128 src0
= _mm_load_ps(src
);
1243 __m128 sum
= _mm_setzero_ps();
1244 Filter4_V1
<FILTER_LENGTH
,0,MARGIN_LENGTH
-4>::do_cal(src0
, src
+4, filter128s
, sum
);
1245 __m128 src4
= _mm_setzero_ps();
1246 { XY_FILTER_4(src0
, src4
, filter128s
.GetAt
<MARGIN_LENGTH
-4>(), sum
); }
1248 _mm_store_ps(src
, sum
);
1250 FilterAllRightMargin_V1
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::do_cal
<0>(src
+4, filter128s
);
1253 static __forceinline
void do_cal
<1>(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1257 __m128 src0
= _mm_load_ps(src
);
1258 __m128 sum
= _mm_setzero_ps();
1259 Filter4_V1
<FILTER_LENGTH
,0,MARGIN_LENGTH
-4>::do_cal(src0
, src
+4, filter128s
, sum
);
1260 //Only one of the last 4 filter coefficiences is non-zero
1261 { XY_FILTER_4_1(src0
, filter128s
.GetAt
<MARGIN_LENGTH
-4>(), sum
); }
1263 _mm_store_ps(src
, sum
);
1265 FilterAllRightMargin_V1
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::do_cal
<0>(src
+4, filter128s
);
1269 template<int FILTER_LENGTH
>
1270 struct FilterAllLeftMargin_V1
<FILTER_LENGTH
,0>
1272 template<int FILTER_TAIL
>
1273 static __forceinline
void do_cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1278 template<int FILTER_LENGTH
>
1279 struct FilterAllRightMargin_V1
<FILTER_LENGTH
,0>
1281 template<int FILTER_TAIL
>
1282 static __forceinline
void do_cal(float * src
, M128s_V1
<FILTER_LENGTH
>& filter128s
)
1289 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
1292 * FILTER_LENGTH<=width && width%4==0
1294 template<int FILTER_LENGTH
>
1295 void xy_filter_sse_template_v1(float *dst
, int width
, int height
, int stride
, const float *filter
)
1297 ASSERT( stride
>=4*(width
+FILTER_LENGTH
) );
1298 ASSERT( ((stride
|(4*width
)|(4*FILTER_LENGTH
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
1300 M128s_V1
<FILTER_LENGTH
> filter128s
;
1301 filter128s
.Load(filter
);
1303 const float *filter_start
= filter
;
1304 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
1305 BYTE
* end
= dst_byte
+ height
*stride
;
1306 for( ; dst_byte
<end
; dst_byte
+=stride
)
1308 float *dst2
= reinterpret_cast<float*>(dst_byte
);
1311 FilterAllLeftMargin_V1
<FILTER_LENGTH
,((FILTER_LENGTH
+3)&~3)>::cal(dst2
, filter128s
);
1312 float *dst_end1
= dst2
+ width
;
1313 float *dst_end0
= dst_end1
- ((FILTER_LENGTH
+3)&~3);
1314 for (;dst2
<dst_end0
;dst2
+=4)
1316 const float *src
= dst2
;
1319 __m128 src0
= _mm_load_ps(src
);/*1 2 3 4*/
1321 __m128 sum
= _mm_setzero_ps();
1322 Filter4_V1
<FILTER_LENGTH
,0,FILTER_LENGTH
>::do_cal(src0
, src
, filter128s
, sum
);
1324 _mm_store_ps(dst2
, sum
);
1326 FilterAllRightMargin_V1
<FILTER_LENGTH
,((FILTER_LENGTH
+3)&~3)>::cal(dst2
, filter128s
);
1333 void xy_filter_sse_v7(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
1335 typedef void (*Filter
)(float *dst
, int width
, int height
, int stride
, const float *filter
);
1336 const Filter filters
[] = {
1338 xy_filter_sse_template_v1
<1>, xy_filter_sse_template_v1
<4>, xy_filter_sse_template_v1
<4>, xy_filter_sse_template_v1
<4>,
1339 xy_filter_sse_template_v1
<5>, xy_filter_sse_template_v1
<8>, xy_filter_sse_template_v1
<8>, xy_filter_sse_template_v1
<8>,
1340 xy_filter_sse_template_v1
<9>, xy_filter_sse_template_v1
<12>, xy_filter_sse_template_v1
<12>, xy_filter_sse_template_v1
<12>,
1341 xy_filter_sse_template_v1
<13>, xy_filter_sse_template_v1
<16>, xy_filter_sse_template_v1
<16>, xy_filter_sse_template_v1
<16>,
1342 xy_filter_sse_template_v1
<17>, xy_filter_sse_template_v1
<20>, xy_filter_sse_template_v1
<20>, xy_filter_sse_template_v1
<20>,
1343 xy_filter_sse_template_v1
<21>, xy_filter_sse_template_v1
<24>, xy_filter_sse_template_v1
<24>, xy_filter_sse_template_v1
<24>,
1344 xy_filter_sse_template_v1
<25>, xy_filter_sse_template_v1
<28>, xy_filter_sse_template_v1
<28>, xy_filter_sse_template_v1
<28>
1346 if (filter_width
<=28 && filter_width
<=width
)
1348 int tmp
= filter_width
;
1349 while( AlmostEqual(filter
[tmp
-1],0.0f
) && filter_width
-tmp
<3)//Skip tail zero, but limited to 3. We cannot support more current.
1351 filters
[tmp
](dst
, width
, height
, stride
, filter
);
1355 xy_filter_sse_v6(dst
, width
, height
, stride
, filter
, filter_width
);
/**
 * Core of the 3-tap symmetric filter. Given 8 consecutive samples in
 * @src4 (x0..x3) and @src_5_8 (x4..x7), computes four outputs
 *   sum[i] = f3_1*x[i] + f3_2*x[i+1] + f3_1*x[i+2]
 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
 * @src4, @src_5_8, @f3_1, @f3_2: const
 * @sum: output (must be declared by the caller)
 */
#define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    sum = _mm_mul_ps(f3_1, src4);\
    __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
    __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
    sum = _mm_add_ps(sum, mul2);\
    sum = _mm_add_ps(sum, mul3);
1380 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
1383 * filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
1385 void xy_3_tag_symmetric_filter_sse(float *dst
, int width
, int height
, int stride
, const float *filter
)
1387 const int filter_width
= 4;
1388 ASSERT( stride
>=4*(width
+filter_width
) );
1389 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
1391 ASSERT(filter_width
==4 && filter
[3]==0 && filter
[2]==filter
[0]);
1393 __m128 f3_1
= _mm_set1_ps(filter
[0]);
1394 __m128 f3_2
= _mm_set1_ps(filter
[1]);
1396 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
1397 BYTE
* end
= dst_byte
+ height
*stride
;
1398 for( ; dst_byte
<end
; dst_byte
+=stride
)
1400 float *dst_f
= reinterpret_cast<float*>(dst_byte
);
1401 float *dst2
= dst_f
;
1403 float *dst_end0
= dst_f
+ width
- 4;
1405 __m128 src4
= _mm_load_ps(dst2
);/*1 2 3 4*/
1408 __m128 src_4
= _mm_setzero_ps();
1409 { XY_3_TAG_SYMMETRIC_FILTER_4(src_4
, src4
, f3_1
, f3_2
, sum
); }
1410 _mm_store_ps(dst2
, sum
);
1412 for (;dst2
<dst_end0
;dst2
+=4)
1415 __m128 src_5_8
= _mm_load_ps(dst2
+4);/*5 6 7 8*/
1416 { XY_3_TAG_SYMMETRIC_FILTER_4(src4
, src_5_8
, f3_1
, f3_2
, sum
); }
1419 _mm_store_ps(dst2
, sum
);
1423 __m128 src_5_8
= _mm_setzero_ps();/*5 6 7 8*/
1424 { XY_3_TAG_SYMMETRIC_FILTER_4(src4
, src_5_8
, f3_1
, f3_2
, sum
); }
1427 _mm_store_ps(dst2
, sum
);
1432 void xy_filter_sse_v8(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
1434 typedef void (*Filter
)(float *dst
, int width
, int height
, int stride
, const float *filter
);
1435 const Filter filters
[] = {
1437 xy_filter_sse_template_v1
<1>, xy_filter_sse_template_v1
<4>, xy_filter_sse_template_v1
<4>, xy_filter_sse_template_v1
<4>,
1438 xy_filter_sse_template_v1
<5>, xy_filter_sse_template_v1
<8>, xy_filter_sse_template_v1
<8>, xy_filter_sse_template_v1
<8>,
1439 xy_filter_sse_template_v1
<9>, xy_filter_sse_template_v1
<12>, xy_filter_sse_template_v1
<12>, xy_filter_sse_template_v1
<12>,
1440 xy_filter_sse_template_v1
<13>, xy_filter_sse_template_v1
<16>, xy_filter_sse_template_v1
<16>, xy_filter_sse_template_v1
<16>,
1441 xy_filter_sse_template_v1
<17>, xy_filter_sse_template_v1
<20>, xy_filter_sse_template_v1
<20>, xy_filter_sse_template_v1
<20>,
1442 xy_filter_sse_template_v1
<21>, xy_filter_sse_template_v1
<24>, xy_filter_sse_template_v1
<24>, xy_filter_sse_template_v1
<24>,
1443 xy_filter_sse_template_v1
<25>, xy_filter_sse_template_v1
<28>, xy_filter_sse_template_v1
<28>, xy_filter_sse_template_v1
<28>
1445 if (filter_width
<=28 && filter_width
<=width
)
1447 int tmp
= filter_width
;
1448 // Skip tail zero, but we cannot (and don't have to) support more than 3 tail zeros currently.
1449 while( AlmostEqual(filter
[tmp
-1],0.0f
) && filter_width
-tmp
<3 )
1451 if (tmp
==3&&filter
[0]==filter
[2])
1453 xy_3_tag_symmetric_filter_sse(dst
, width
, height
, stride
, filter
);
1457 filters
[tmp
](dst
, width
, height
, stride
, filter
);
1462 xy_filter_sse_v6(dst
, width
, height
, stride
, filter
, filter_width
);
1468 class XyFilterTest
: public ::testing::Test
1471 static const int MAX_FILTER_LENGTH
= 256;
1472 static const int MAX_DATA_BUFF_SIZE
= 512*512;
1473 static const int MAX_BUFF_SIZE
= MAX_DATA_BUFF_SIZE
+ 2*MAX_FILTER_LENGTH
;
1482 int ex_filter_width
;
1484 XyFilterTest():buff_base(NULL
),filter_f(NULL
),data(NULL
)
1486 buff_base
= (float*)xy_malloc(MAX_BUFF_SIZE
*sizeof(float));
1487 data
= buff_base
+ MAX_FILTER_LENGTH
;
1488 filter_f
= data
+ MAX_DATA_BUFF_SIZE
;
1492 xy_free(buff_base
);buff_base
=NULL
;
1495 const XyFilterTest
& copy (const XyFilterTest
& rhs
)
1500 filter_width
= rhs
.filter_width
;
1501 ex_filter_width
= rhs
.ex_filter_width
;
1503 memcpy(buff_base
, rhs
.buff_base
, MAX_BUFF_SIZE
*sizeof(float));
1507 void FillRandData(int w
, int h
, int filter_width
)
1511 this->filter_width
= filter_width
;
1512 this->ex_filter_width
= ((filter_width
+3)&~3);
1513 this->pitch
= ((w
+ex_filter_width
+3)&~3)*sizeof(float);
1515 ASSERT(this->pitch
*this->h
<= MAX_DATA_BUFF_SIZE
);
1517 for (int i
=0;i
<h
;i
++)
1519 float *data2
= (float*)((PUINT8
)data
+ i
*pitch
);
1520 for (int j
=0;j
<w
+filter_width
;j
++)
1522 data2
[j
] = rand()&0xFF;
1526 for (int i
=0;i
<filter_width
/2+1;i
++)
1528 filter_f
[i
] = abs((rand()&0xFFFFF)+0xFFFFF);
1531 for (int i
=filter_width
/2+1;i
<filter_width
;i
++)
1533 filter_f
[i
] = filter_f
[filter_width
-i
-1];
1536 for (int i
=0;i
<filter_width
;i
++)
1540 for (int i
=filter_width
;i
<ex_filter_width
;i
++)
1546 virtual void SetUp()
1550 XyFilterTest(const XyFilterTest
&);
1551 const XyFilterTest
& operator= (const XyFilterTest
& rhs
);
// Benchmark driver: times `function` over `loop_num` iterations on random data.
// The commented-out ASSERT_EQ exists only to keep the compiler from optimizing
// the loop away ("anit compiler optimize" in the original).
#define FilterTest(width, height, FILTER_LENGTH, loop_num, function) \
    TEST_F(XyFilterTest, function ## _ ## width ## _ ## height ## _ ## FILTER_LENGTH ## _ ## loop_num ) \
    {\
        FillRandData(width, height, FILTER_LENGTH);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data, w, h, pitch, filter_f, (FILTER_LENGTH+3)&~3);\
        }\
        /*ASSERT_EQ(data[-filter_width],0)*/;/*anit compiler optimize*/\
    }

// Instantiates the same benchmark for both v7 and v8 so they can be compared.
#define DUAL_TEST(width, height, FILTER_LENGTH, loop_num) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v7) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v8)
// Benchmark matrix: (width, height, filter length, iterations).
// Filter lengths 3 (hits the 3-tap symmetric fast path in v8), 20 and 21
// (even/odd padded template lengths) across several image sizes.
DUAL_TEST(128, 16, 3, 30000)
DUAL_TEST(256, 16, 3, 30000)
DUAL_TEST(512, 16, 3, 30000)
DUAL_TEST(128, 32, 3, 30000)
DUAL_TEST(256, 32, 3, 30000)
DUAL_TEST(512, 32, 3, 30000)
DUAL_TEST(128, 64, 3, 30000)
DUAL_TEST(256, 64, 3, 30000)
DUAL_TEST(512, 64, 3, 30000)
DUAL_TEST(128, 16, 20, 20000)
DUAL_TEST(256, 16, 20, 20000)
DUAL_TEST(512, 16, 20, 20000)
DUAL_TEST(128, 32, 20, 20000)
DUAL_TEST(256, 32, 20, 20000)
DUAL_TEST(512, 32, 20, 20000)
DUAL_TEST(128, 64, 20, 20000)
DUAL_TEST(256, 64, 20, 20000)
DUAL_TEST(512, 64, 20, 20000)
DUAL_TEST(128, 16, 21, 20000)
DUAL_TEST(256, 16, 21, 20000)
DUAL_TEST(512, 16, 21, 20000)
DUAL_TEST(128, 32, 21, 20000)
DUAL_TEST(256, 32, 21, 20000)
DUAL_TEST(512, 32, 21, 20000)
DUAL_TEST(128, 64, 21, 20000)
DUAL_TEST(256, 64, 21, 20000)
DUAL_TEST(512, 64, 21, 20000)
1598 //////////////////////////////////////////////////////////////////////
1602 void xy_float_2_float_transpose_c_v0(float *dst
, int dst_width
, int dst_stride
,
1603 const float *src
, int width
, int height
, int src_stride
)
1605 ASSERT(dst_width
>= height
);
1606 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1607 const float* src_end
= src
+ width
;
1608 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
1609 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
1611 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
1613 float *dst2
= reinterpret_cast<float*>(dst_byte
);
1614 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
1616 *dst2
= *reinterpret_cast<const float*>(src2
);
1618 float *dst2_end
= reinterpret_cast<float*>(dst_byte
) + dst_width
;
1619 for (;dst2
<dst2_end
;dst2
++)
1626 void xy_float_2_float_transpose_sse_v0(float *dst
, int dst_width
, int dst_stride
,
1627 const float *src
, int width
, int height
, int src_stride
)
1630 typedef const float SrcT
;
1632 ASSERT( (((int)dst
|dst_stride
)&15)==0 );
1633 ASSERT(dst_width
>= height
);
1634 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1635 SrcT
* src_end
= src
+ width
;
1636 PCUINT8 src2_end1
= reinterpret_cast<PCUINT8
>(src
) + (height
&~3)*src_stride
;
1637 PCUINT8 src2_end2
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
1638 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
1640 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
1642 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
1643 for (;src2
<src2_end1
;src2
+=4*src_stride
,dst2
+=4)
1645 __m128 m1
= _mm_set_ps(
1646 *(SrcT
*)(src2
+3*src_stride
),
1647 *(SrcT
*)(src2
+2*src_stride
),
1648 *(SrcT
*)(src2
+src_stride
),
1650 _mm_store_ps(dst2
, m1
);
1652 for (;src2
<src2_end2
;src2
+=src_stride
,dst2
++)
1654 *dst2
= *reinterpret_cast<SrcT
*>(src2
);
1656 float *dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
1657 for (;dst2
<dst2_end
;dst2
++)
1664 void float_2_byte_transpose_v0(UINT8
*dst
, int dst_width
, int dst_stride
,
1665 const float *src
, int width
, int height
, int src_stride
)
1667 ASSERT(dst_width
>= height
);
1668 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1669 const float* src_end
= src
+ width
;
1670 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
1671 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
1673 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
1675 UINT8
*dst2
= reinterpret_cast<UINT8
*>(dst_byte
);
1676 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
1678 *dst2
= static_cast<UINT8
>(*reinterpret_cast<const float*>(src2
)+0.5);
1680 UINT8
*dst2_end
= reinterpret_cast<UINT8
*>(dst_byte
) + dst_width
;
1681 for (;dst2
<dst2_end
;dst2
++)
1688 void float_2_byte_transpose_sse_v0(UINT8
*dst
, int dst_width
, int dst_stride
,
1689 const float *src
, int width
, int height
, int src_stride
)
1692 typedef const float SrcT
;
1694 ASSERT(dst_width
>= height
);
1695 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1696 SrcT
* src_end
= src
+ width
;
1697 PCUINT8 src2_end00
= reinterpret_cast<PCUINT8
>(src
) + (height
&~15)*src_stride
;
1698 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
1699 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
1701 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
1703 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
1704 for (;src2
<src2_end00
;src2
+=16*src_stride
,dst2
+=16)
1706 __m128 m1
= _mm_set_ps(
1707 *(SrcT
*)(src2
+3*src_stride
),
1708 *(SrcT
*)(src2
+2*src_stride
),
1709 *(SrcT
*)(src2
+src_stride
),
1711 __m128 m2
= _mm_set_ps(
1712 *(SrcT
*)(src2
+7*src_stride
),
1713 *(SrcT
*)(src2
+6*src_stride
),
1714 *(SrcT
*)(src2
+5*src_stride
),
1715 *(SrcT
*)(src2
+4*src_stride
));
1716 __m128 m3
= _mm_set_ps(
1717 *(SrcT
*)(src2
+11*src_stride
),
1718 *(SrcT
*)(src2
+10*src_stride
),
1719 *(SrcT
*)(src2
+9*src_stride
),
1720 *(SrcT
*)(src2
+8*src_stride
));
1721 __m128 m4
= _mm_set_ps(
1722 *(SrcT
*)(src2
+15*src_stride
),
1723 *(SrcT
*)(src2
+14*src_stride
),
1724 *(SrcT
*)(src2
+13*src_stride
),
1725 *(SrcT
*)(src2
+12*src_stride
));
1727 __m128i i1
= _mm_cvtps_epi32(m1
);
1728 __m128i i2
= _mm_cvtps_epi32(m2
);
1729 __m128i i3
= _mm_cvtps_epi32(m3
);
1730 __m128i i4
= _mm_cvtps_epi32(m4
);
1732 i1
= _mm_packs_epi32(i1
,i2
);
1733 i3
= _mm_packs_epi32(i3
,i4
);
1734 i1
= _mm_packus_epi16(i1
,i3
);
1736 _mm_store_si128((__m128i
*)dst2
, i1
);
1738 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
1740 *dst2
= static_cast<DstT
>(*reinterpret_cast<SrcT
*>(src2
)+0.5);
1742 DstT
*dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
1743 for (;dst2
<dst2_end
;dst2
++)
1750 void byte_2_float_c_v0(float *dst
, int dst_width
, int dst_stride
,
1751 PCUINT8 src
, int width
, int height
, int stride
)
1753 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1755 PCUINT8 src_end
= src
+ height
*stride
;
1756 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
1759 PCUINT8 src_end
= src2
+ width
;
1760 float *dst2
= reinterpret_cast<float*>(dst_byte
);
1762 for (;src2
<src_end
;src2
++, dst2
++)
1766 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
1767 for (;dst2
<dst2_end
;dst2
++)
1774 void byte_2_float_sse_v0(float *dst
, int dst_width
, int dst_stride
,
1775 PCUINT8 src
, int width
, int height
, int stride
)
1777 ASSERT( dst_width
>=width
);
1778 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
1779 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1781 PCUINT8 src_end
= src
+ height
*stride
;
1782 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
1785 PCUINT8 src2_end0
= src2
+ (width
&~15);
1786 float *dst2
= reinterpret_cast<float*>(dst_byte
);
1788 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
1791 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
1792 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
1793 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
1794 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
1795 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
1796 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
1797 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
1798 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
1799 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
1800 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
1801 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
1802 _mm_stream_ps(dst2
, dst_f1
);
1803 _mm_stream_ps(dst2
+4, dst_f2
);
1804 _mm_stream_ps(dst2
+8, dst_f3
);
1805 _mm_stream_ps(dst2
+12, dst_f4
);
1807 PCUINT8 src2_end
= src
+ width
;
1808 for (;src2
<src2_end
;src2
++,dst2
++)
1812 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
1813 for (;dst2
<dst2_end
;dst2
++)
1820 void byte_2_float_sse_v1(float *dst
, int dst_width
, int dst_stride
,
1821 PCUINT8 src
, int width
, int height
, int stride
)
1823 ASSERT( dst_width
>=width
);
1824 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
1825 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
1827 PCUINT8 src_end
= src
+ height
*stride
;
1828 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
1831 PCUINT8 src2_end0
= src2
+ (width
&~15);
1832 float *dst2
= reinterpret_cast<float*>(dst_byte
);
1834 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
1837 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
1838 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
1839 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
1840 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
1841 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
1842 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
1843 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
1844 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
1845 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
1846 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
1847 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
1848 _mm_store_ps(dst2
, dst_f1
);
1849 _mm_store_ps(dst2
+4, dst_f2
);
1850 _mm_store_ps(dst2
+8, dst_f3
);
1851 _mm_store_ps(dst2
+12, dst_f4
);
1853 PCUINT8 src2_end
= src
+ width
;
1854 for (;src2
<src2_end
;src2
++,dst2
++)
1858 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
1859 for (;dst2
<dst2_end
;dst2
++)
1866 class XyTransposeTest
: public ::testing::Test
1869 static const int MAX_FILTER_LENGTH
= 128;
1870 static const int MAX_DATA_BUFF_SIZE
= 512*128;
1872 UINT8 data_byte_base
[MAX_DATA_BUFF_SIZE
+15];
1875 float data_base_f
[2*MAX_DATA_BUFF_SIZE
+3];
1881 virtual void SetUp()
1885 void FillRandData(int w
, int h
)
1891 this->pitch
= ((w
+3)&~3)*sizeof(float);
1892 ASSERT(this->pitch
*this->h
<= MAX_DATA_BUFF_SIZE
);
1894 data_f
= data_base_f
;
1895 if ((int)(data_f
)&15)
1897 data_f
= (float*)(((int)data_f
+ 15) & ~15);
1899 for (int i
=0;i
<h
;i
++)
1901 float *data2
= (float*)((PUINT8
)data_f
+ i
*pitch
);
1902 for (int j
=0;j
<w
;j
++)
1904 data2
[j
] = rand()*0.1f
/0xFF;
1907 data_f2
= data_f
+ h
*pitch
;
1909 data_b
= data_byte_base
;
1910 if ((int)(data_b
)&15)
1912 data_b
= (UINT8
*)(((int)data_b
+ 15) & ~15);
1918 #define Float2FloatTest(width, height, loop_num, function) \
1919 TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
1921 FillRandData(width, height);\
1922 for(int i=0;i<loop_num;i++)\
1924 function(data_f2, height, (width+15)&~15, data_f, width, height, pitch);\
1926 ASSERT_EQ(data_f2[0],data_f2[0]);\
1929 Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_c_v0
)
1930 Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_sse_v0
)
1933 #define Float2ByteTest(width, height, loop_num, function) \
1934 TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
1936 FillRandData(width, height);\
1937 for(int i=0;i<loop_num;i++)\
1939 function(data_b, height, (width+15)&~15, data_f, width, height, pitch);\
1941 ASSERT_EQ(data_b[0],data_b[0]);\
1944 Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_v0
)
1945 Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_sse_v0
)
1948 #define Byte2FloatTest(dst_width, width, height, loop_num, function) \
1949 TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
1951 FillRandData(width, height);\
1952 for(int i=0;i<loop_num;i++)\
1954 function(data_f, width, (width+15)&~15, data_b, width, height, pitch);\
1956 ASSERT_EQ(data_b[0],data_b[0]);\
1960 Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_c_v0
)
1961 Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v0
)
1962 Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v1
)
1964 #endif // __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__