Blur Optimization. [Part 7]
[xy_vsfilter.git] / test / unit_test / xy_filter_benchmark.h
#ifndef __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__
#define __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__

#include <gtest/gtest.h>
#include <wtypes.h>
#include <math.h>
#include <xmmintrin.h>
#include <emmintrin.h> //SSE2: the __m128i loads/packs used by the byte<->float conversions below

#include "xy_malloc.h"

typedef const UINT8 CUINT8, *PCUINT8;
//////////////////////////////////////////////////////////////////////

// filter test
void xy_filter_c_v0(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
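// Reference semantics (as implemented by the loops above): the filter runs
// in place on each row, with src==dst and outputs for x in [-filter_width, width):
//   dst[x] = sum( k in [0,filter_width) : src[x+k]*filter[k] )
// where src[i] counts as 0 outside [0,width). Each row therefore gains
// filter_width extra outputs on its left, which is why callers must leave
// filter_width floats of headroom before every row (stride>=4*(width+filter_width)).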
void xy_filter_sse_v0(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
void xy_filter_sse_v1(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
void xy_filter_sse_v2(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
static __forceinline void xy_filter_4_inline(__m128& src4, const __m128& src_5_8, const __m128& f4, __m128& sum)
{
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));
    f4_1 = _mm_mul_ps(f4_1, src4);
    sum = _mm_add_ps(sum, f4_1);
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));
    f4_1 = _mm_mul_ps(f4_1, src_3_6);
    sum = _mm_add_ps(sum, f4_1);
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));
    f4_1 = _mm_mul_ps(f4_1, src4);
    sum = _mm_add_ps(sum, f4_1);
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));
    f4_1 = _mm_mul_ps(f4_1, src_3_6);
    sum = _mm_add_ps(sum, f4_1);
}
void xy_filter_sse_v3(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);

                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            xy_filter_4_inline(src4, src_5_8, f4, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            xy_filter_4_inline(src4, src_5_8, f4, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}
/****
 * The inline-function version (xy_filter_4_inline above) sometimes makes the
 * compiler generate poor code, hence this macro variant.
 *
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4: const
 * @sum : output
 * @src4: undefined (clobbered)
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
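// Usage sketch (illustrative only, not called by any benchmark): one aligned
// block of 4 outputs from an 8-tap filter, with samples past src8[7] taken as
// 0. Both src8 and taps8 are assumed 16-byte aligned. Computes
//   out[i] = sum( k in [0,8) : src8[i+k]*taps8[k] ),  i = 0..3
static __forceinline __m128 xy_filter_4_example(const float *src8, const float *taps8)
{
    __m128 src4 = _mm_load_ps(src8);         /*samples 1 2 3 4*/
    __m128 src_5_8 = _mm_load_ps(src8+4);    /*samples 5 6 7 8*/
    __m128 sum = _mm_setzero_ps();
    __m128 f4 = _mm_load_ps(taps8);          /*taps 1 2 3 4*/
    { XY_FILTER_4(src4, src_5_8, f4, sum); } /*src4 is clobbered, as documented*/
    src4 = src_5_8;
    src_5_8 = _mm_setzero_ps();              /*no samples beyond the 8th*/
    f4 = _mm_load_ps(taps8+4);               /*taps 5 6 7 8*/
    { XY_FILTER_4(src4, src_5_8, f4, sum); }
    return sum;
}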
__forceinline void xy_filter_one_line_sse_v4(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start-=4;
        filter_end-=4;

        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=4)
    {
        const float *src = dst2;

        //filter 4
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end-=4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
            //move new 4 in_n_out to old 4 in_n_out
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}
void xy_filter_sse_v4(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
// v5
template<int LENGTH>
struct M128s
{
    __m128 x;
    M128s<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }

    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s<0>
{
    void Load(const float* src)
    {
    }
};
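// Illustration (assumed reading, not used by the tests): M128s<8> flattens to
// two __m128 members; after Load(filter) the first holds filter[0..3] and the
// second filter[4..7]. GetAt<0>() and GetAt<4>() resolve to them at compile time.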
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        Filter4<FILTER_LENGTH,START,LENGTH-4>::do_cal(src0_128, src4, filter128s, sum);
        __m128 src4_128 = _mm_load_ps(src4+LENGTH-4);
        XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START+LENGTH-4>(), sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllMargin
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_left_margin(src, filter128s);
    }

    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            __m128 src4 = _mm_setzero_ps();
            XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum);
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_right_margin(src+4, filter128s);
    }
};

template<int FILTER_LENGTH>
struct FilterAllMargin<FILTER_LENGTH,0>
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
};
/****
 * See @xy_filter_c
 * Constrain:
 *   FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template_v0(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_left_margin(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - FILTER_LENGTH;
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_right_margin(dst2, filter128s);
    }
}
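// The point of the template: FILTER_LENGTH is a compile-time constant, so the
// margin recursions above unroll completely and the filter coefficients stay
// in registers. xy_filter_sse_v5 below picks the matching instantiation.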
void xy_filter_sse_v5(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = { xy_filter_sse_template_v0<4>, xy_filter_sse_template_v0<8>, xy_filter_sse_template_v0<12>, xy_filter_sse_template_v0<16>,
        xy_filter_sse_template_v0<20>, xy_filter_sse_template_v0<24>, xy_filter_sse_template_v0<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        ASSERT(filter_width%4==0);
        filters[(filter_width-1)/4](dst, width, height, stride, filter);
    }
    else
    {
        xy_filter_sse_v4(dst, width, height, stride, filter, filter_width);
    }
}
// v5 end
__forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width - 4;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl-4);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start-=4;
        filter_end-=4;

        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=8)
    {
        const float *src = dst2;
        const float* f=filter_start;

        //filter 4
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        src+=4;
        __m128 src_5_8;
        __m128 sum = _mm_setzero_ps();
        __m128 sum2 = _mm_setzero_ps();
        __m128 f4 = _mm_load_ps(f);
        f+=4;
        src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
        src+=4;
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        for(;f<filter_end;f+=4,src+=4)
        {
            src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 tmp = src_5_8;//important!
            { XY_FILTER_4(tmp, src4, f4, sum2); }

            f4 = _mm_load_ps(f);
            { XY_FILTER_4(src_5_8, src4, f4, sum); }
            src_5_8 = src4;
        }
        src4 = _mm_load_ps(src);/*1 2 3 4*/
        { XY_FILTER_4(src_5_8, src4, f4, sum2); }

        //store result
        _mm_store_ps(dst2, sum);
        _mm_store_ps(dst2+4, sum2);
    }
    if (dst2==dst_end0)
    {
        const float *src = dst2;
        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        src+=4;
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
        dst2+=4;
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end-=4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
            //move new 4 in_n_out to old 4 in_n_out
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}

void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
// v7

// ref: "Comparing floating point numbers" by Bruce Dawson
// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
bool AlmostEqual(float A, float B, int maxUlps=0)
{
    // Make sure maxUlps is non-negative and small enough that the
    // default NAN won't compare as equal to anything.
    // (>=0 rather than >0: callers rely on the default maxUlps==0,
    // which demands bit-exact equality.)
    ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
    int aInt = *(int*)&A;
    // Make aInt lexicographically ordered as a twos-complement int
    if (aInt < 0)
        aInt = 0x80000000 - aInt;
    // Make bInt lexicographically ordered as a twos-complement int
    int bInt = *(int*)&B;
    if (bInt < 0)
        bInt = 0x80000000 - bInt;

    int intDiff = abs(aInt - bInt);
    if (intDiff <= maxUlps)
        return true;

    return false;
}
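// Note (illustrative): the tail-zero scans in xy_filter_sse_v7/v8 call
// AlmostEqual(filter[tmp-1], 0.0f) with the default maxUlps==0, so only a
// bit-exact 0.0f (or -0.0f, which maps to the same ordered value) counts as zero.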
/****
 * @src4, @f4_1, @sum : __m128
 * @f4_1: const
 * @sum : output
 * @src4: undefined (clobbered)
 **/
#define XY_FILTER_4_1(src4, f4_1, sum) \
    src4 = _mm_mul_ps(src4, f4_1);\
    sum = _mm_add_ps(sum, src4);
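// XY_FILTER_4_1 handles a single leftover tap: f4_1 carries one coefficient
// broadcast to all four lanes (see M128s_V1<1>::Load below), so this is just
// sum += src4*coef for four outputs at once.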
/****
 * Constrain:
 *   LENGTH%4 == 0 || LENGTH%4 == 1
 **/
template<int LENGTH>
struct M128s_V1
{
    __m128 x;
    M128s_V1<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }

    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s_V1<1>
{
    __m128 x;

    template<int Index> __forceinline __m128& GetAt()
    {
        return x;
    }
    __forceinline void Load(const float* src)
    {
        x = _mm_set1_ps(*src);
    }
};

template<>
struct M128s_V1<0>
{
    void Load(const float* src)
    {
    }
};
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4_V1
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        __m128 src4_128 = _mm_load_ps(src4);
        { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
        Filter4_V1<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4_V1<FILTER_LENGTH, START, 1>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
    }
    template<int TAIL>
    static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
    template<>
    static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4_V1<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllLeftMargin_V1
{
    static __forceinline void cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4_V1<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 sum = _mm_setzero_ps();
        //Only one of the last 4 filter coefficients is non-zero
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
};

template<int FILTER_LENGTH, int MARGIN_LENGTH>
struct FilterAllRightMargin_V1
{
    static __forceinline void cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            __m128 src4 = _mm_setzero_ps();
            { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            //Only one of the last 4 filter coefficients is non-zero
            { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
};

template<int FILTER_LENGTH>
struct FilterAllLeftMargin_V1<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
    }
};

template<int FILTER_LENGTH>
struct FilterAllRightMargin_V1<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
    }
};
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
 * See @xy_filter_c
 * Constrain:
 *   FILTER_LENGTH<=width && width%4==0
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template_v1(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s_V1<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllLeftMargin_V1<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
    }
}
/****
 * See @xy_filter_c
 **/
void xy_filter_sse_v7(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        NULL,
        xy_filter_sse_template_v1<1>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>,
        xy_filter_sse_template_v1<5>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>,
        xy_filter_sse_template_v1<9>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>,
        xy_filter_sse_template_v1<13>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>,
        xy_filter_sse_template_v1<17>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>,
        xy_filter_sse_template_v1<21>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>,
        xy_filter_sse_template_v1<25>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        int tmp = filter_width;
        //Skip tail zeros, but at most 3 of them; the template versions cannot absorb more currently.
        while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
            tmp--;
        filters[tmp](dst, width, height, stride, filter);
    }
    else
    {
        xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
    }
}
// v7 end
// v8

/****
 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
 * @src4, @src_5_8, @f3_1, @f3_2: const
 * @sum: output
 **/
#define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    sum = _mm_mul_ps(f3_1, src4);\
    __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
    __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
    sum = _mm_add_ps(sum, mul2);\
    sum = _mm_add_ps(sum, mul3);
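// Worked out per output lane i (src4 = src[0..3], src_5_8 = src[4..7],
// f3_1 = a broadcast, f3_2 = b broadcast):
//   sum[i] = a*src[i] + b*src[i+1] + a*src[i+2]
// i.e. a symmetric 3-tap convolution of 4 outputs in 3 multiplies instead of 4.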
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
 * See @xy_filter_c
 * Constrain:
 *   filter[3]==0 && filter[0]==filter[2] (symmetric) (&& sum(filter)==1)
 **/
void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
{
    const int filter_width = 4;
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);

    __m128 f3_1 = _mm_set1_ps(filter[0]);
    __m128 f3_2 = _mm_set1_ps(filter[1]);

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f;

        float *dst_end0 = dst_f + width - 4;
        //filter 4
        __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
        {
            __m128 sum;
            __m128 src_4 = _mm_setzero_ps();
            { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
            //left margin block: stored at dst2-4 so the in-place layout matches xy_filter_c
            _mm_store_ps(dst2-4, sum);
        }
        for (;dst2<dst_end0;dst2+=4)
        {
            __m128 sum;
            __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            src4 = src_5_8;
            //store result
            _mm_store_ps(dst2, sum);
        }
        {
            __m128 sum;
            __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            src4 = src_5_8;
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}

void xy_filter_sse_v8(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        NULL,
        xy_filter_sse_template_v1<1>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>,
        xy_filter_sse_template_v1<5>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>,
        xy_filter_sse_template_v1<9>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>,
        xy_filter_sse_template_v1<13>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>,
        xy_filter_sse_template_v1<17>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>,
        xy_filter_sse_template_v1<21>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>,
        xy_filter_sse_template_v1<25>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        int tmp = filter_width;
        // Skip tail zeros, but we cannot (and don't have to) support more than 3 of them currently.
        while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
            tmp--;
        if (tmp==3 && filter[0]==filter[2])
        {
            xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
        }
        else
        {
            filters[tmp](dst, width, height, stride, filter);
        }
    }
    else
    {
        xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
    }
}
// v8 end
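// Call sketch (illustrative; it mirrors what the FilterTest fixture below sets
// up). width and the padded filter length must be multiples of 4, and data and
// filter_f 16-byte aligned, to satisfy the asserts above:
//
//   int ex_w = (filter_width+3)&~3;                  //padded length actually passed
//   int stride = ((width+ex_w+3)&~3)*sizeof(float);  //bytes, multiple of 16
//   float *base = (float*)xy_malloc(height*stride + ex_w*sizeof(float));
//   float *data = base + ex_w;                       //row 0; headroom sits before it
//   //... fill data and the zero-padded filter_f ...
//   xy_filter_sse_v8(data, width, height, stride, filter_f, ex_w);
//   xy_free(base);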
class XyFilterTest : public ::testing::Test
{
public:
    static const int MAX_FILTER_LENGTH = 256;
    static const int MAX_DATA_BUFF_SIZE = 512*512;
    static const int MAX_BUFF_SIZE = MAX_DATA_BUFF_SIZE + 2*MAX_FILTER_LENGTH;

    float *buff_base;

    float *filter_f;

    float *data;
    int w, h, pitch;
    int filter_width;
    int ex_filter_width;

    XyFilterTest():buff_base(NULL),filter_f(NULL),data(NULL)
    {
        buff_base = (float*)xy_malloc(MAX_BUFF_SIZE*sizeof(float));
        data = buff_base + MAX_FILTER_LENGTH;
        filter_f = data + MAX_DATA_BUFF_SIZE;
    }
    ~XyFilterTest()
    {
        xy_free(buff_base); buff_base=NULL;
    }

    const XyFilterTest& copy (const XyFilterTest& rhs)
    {
        w = rhs.w;
        h = rhs.h;
        pitch = rhs.pitch;
        filter_width = rhs.filter_width;
        ex_filter_width = rhs.ex_filter_width;

        memcpy(buff_base, rhs.buff_base, MAX_BUFF_SIZE*sizeof(float));
        return *this;
    }

    void FillRandData(int w, int h, int filter_width)
    {
        this->w = w;
        this->h = h;
        this->filter_width = filter_width;
        this->ex_filter_width = ((filter_width+3)&~3);
        this->pitch = ((w+ex_filter_width+3)&~3)*sizeof(float);

        //pitch is in bytes while MAX_DATA_BUFF_SIZE counts floats, so this
        //bound is 4x conservative but safe
        ASSERT(this->pitch*this->h <= MAX_DATA_BUFF_SIZE);

        for (int i=0;i<h;i++)
        {
            float *data2 = (float*)((PUINT8)data + i*pitch);
            for (int j=0;j<w+filter_width;j++)
            {
                data2[j] = rand()&0xFF;
            }
        }

        //build a normalized, symmetric random filter, zero-padded to ex_filter_width
        float sum = 0;
        for (int i=0;i<filter_width/2+1;i++)
        {
            filter_f[i] = abs((rand()&0xFFFFF)+0xFFFFF);
            sum += filter_f[i];
        }
        for (int i=filter_width/2+1;i<filter_width;i++)
        {
            filter_f[i] = filter_f[filter_width-i-1];
            sum += filter_f[i];
        }
        for (int i=0;i<filter_width;i++)
        {
            filter_f[i] /= sum;
        }
        for (int i=filter_width;i<ex_filter_width;i++)
        {
            filter_f[i] = 0;
        }
    }
protected:
    virtual void SetUp()
    {
    }
private:
    XyFilterTest(const XyFilterTest&);
    const XyFilterTest& operator= (const XyFilterTest& rhs);
};
#define FilterTest(width, height, FILTER_LENGTH, loop_num, function) \
    TEST_F(XyFilterTest, function ## _ ## width ## _ ## height ## _ ## FILTER_LENGTH ## _ ## loop_num ) \
    {\
        FillRandData(width, height, FILTER_LENGTH);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data, w, h, pitch, filter_f, (FILTER_LENGTH+3)&~3);\
        }\
        /*ASSERT_EQ(data[-filter_width],0)*/;/*anti compiler optimization*/\
        ASSERT_EQ(0,0);\
    }

#define DUAL_TEST(width, height, FILTER_LENGTH, loop_num) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v7) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v8)

DUAL_TEST(128, 16, 3, 30000)
DUAL_TEST(256, 16, 3, 30000)
DUAL_TEST(512, 16, 3, 30000)
DUAL_TEST(128, 32, 3, 30000)
DUAL_TEST(256, 32, 3, 30000)
DUAL_TEST(512, 32, 3, 30000)
DUAL_TEST(128, 64, 3, 30000)
DUAL_TEST(256, 64, 3, 30000)
DUAL_TEST(512, 64, 3, 30000)

DUAL_TEST(128, 16, 20, 20000)
DUAL_TEST(256, 16, 20, 20000)
DUAL_TEST(512, 16, 20, 20000)
DUAL_TEST(128, 32, 20, 20000)
DUAL_TEST(256, 32, 20, 20000)
DUAL_TEST(512, 32, 20, 20000)
DUAL_TEST(128, 64, 20, 20000)
DUAL_TEST(256, 64, 20, 20000)
DUAL_TEST(512, 64, 20, 20000)

DUAL_TEST(128, 16, 21, 20000)
DUAL_TEST(256, 16, 21, 20000)
DUAL_TEST(512, 16, 21, 20000)
DUAL_TEST(128, 32, 21, 20000)
DUAL_TEST(256, 32, 21, 20000)
DUAL_TEST(512, 32, 21, 20000)
DUAL_TEST(128, 64, 21, 20000)
DUAL_TEST(256, 64, 21, 20000)
DUAL_TEST(512, 64, 21, 20000)
//////////////////////////////////////////////////////////////////////

// transpose test
void xy_float_2_float_transpose_c_v0(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
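// Semantics: a plain transpose with zero padding. For column j of src and row i,
//   ((float*)((PUINT8)dst + j*dst_stride))[i] = ((const float*)((PCUINT8)src + i*src_stride))[j]
// for j in [0,width), i in [0,height); entries [height,dst_width) of each dst
// row are cleared to 0. Both strides are in bytes.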
void xy_float_2_float_transpose_sse_v0(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            //gather 4 rows of the current column into one aligned store
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void float_2_byte_transpose_v0(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void float_2_byte_transpose_sse_v0(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            //gather 16 rows of the current column, then convert and pack down to bytes
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void byte_2_float_c_v0(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
void byte_2_float_sse_v0(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //unpack 16 bytes into 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_stream_ps(dst2, dst_f1);
            _mm_stream_ps(dst2+4, dst_f2);
            _mm_stream_ps(dst2+8, dst_f3);
            _mm_stream_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
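// byte_2_float_sse_v1 below performs exactly the same unpack/convert; the only
// difference is _mm_store_ps (cached stores) in place of _mm_stream_ps
// (non-temporal stores), which is what this pair of benchmarks compares.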
void byte_2_float_sse_v1(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //unpack 16 bytes into 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
class XyTransposeTest : public ::testing::Test
{
public:
    static const int MAX_FILTER_LENGTH = 128;
    static const int MAX_DATA_BUFF_SIZE = 512*128;

    UINT8 data_byte_base[MAX_DATA_BUFF_SIZE+15];
    UINT8 *data_b;

    float data_base_f[2*MAX_DATA_BUFF_SIZE+3];
    float *data_f;
    float *data_f2;
    int w, h, pitch;

protected:
    virtual void SetUp()
    {
    }

    void FillRandData(int w, int h)
    {
        this->w = w;
        this->h = h;
        ASSERT(w%4==0);

        this->pitch = ((w+3)&~3)*sizeof(float);
        //pitch is in bytes, but data_f2 below is offset by h*pitch *floats*, so
        //both the source rows and the transposed output (up to w rows of
        //(w+15)&~3 bytes) must fit into the 2*MAX_DATA_BUFF_SIZE float buffer
        ASSERT(this->pitch*this->h + this->w*((this->w+15)&~15)/(int)sizeof(float) <= 2*MAX_DATA_BUFF_SIZE);

        data_f = data_base_f;
        if ((int)(data_f)&15)
        {
            data_f = (float*)(((int)data_f + 15) & ~15);
        }
        for (int i=0;i<h;i++)
        {
            float *data2 = (float*)((PUINT8)data_f + i*pitch);
            for (int j=0;j<w;j++)
            {
                data2[j] = rand()*0.1f/0xFF;
            }
        }
        data_f2 = data_f + h*pitch;

        data_b = data_byte_base;
        if ((int)(data_b)&15)
        {
            data_b = (UINT8*)(((int)data_b + 15) & ~15);
        }
    }
};
#define Float2FloatTest(width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_f2, height, (width+15)&~15, data_f, width, height, pitch);\
        }\
        ASSERT_EQ(data_f2[0],data_f2[0]);\
    }

Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_c_v0)
Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_sse_v0)


#define Float2ByteTest(width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_b, height, (width+15)&~15, data_f, width, height, pitch);\
        }\
        ASSERT_EQ(data_b[0],data_b[0]);\
    }

Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_v0)
Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_sse_v0)


#define Byte2FloatTest(dst_width, width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_f, dst_width, (width+15)&~15, data_b, width, height, pitch);\
        }\
        ASSERT_EQ(data_b[0],data_b[0]);\
    }

Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_c_v0)
Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v0)
Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v1)
#endif // __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__