3 typedef const UINT8 CUINT8
, *PCUINT8
;
8 __forceinline
void xy_filter_one_line_c(float *dst
, int width
, const float *filter
, int filter_width
)
10 const float *filter_start
= filter
;
11 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
12 float *dst2
= dst
- filter_width
;
13 float *dst_endr
= dst
+ width
;
14 float *dst_end0
= dst_endr
- filter_width
;
15 float *dst_endl
= dst
- xx_fix
;
16 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
18 ASSERT(filter_start
== filter
);
19 filter_start
+= filter_width
;
20 const float *filter_end
= filter_start
;
21 for (;dst2
<dst_endl
;dst2
++, filter_start
--)//left margin
23 const float *src
= dst
;
25 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
31 for (;dst2
<dst
;dst2
++, filter_start
--, filter_end
--)//if width < filter_width
33 const float *src
= dst
;
35 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
41 ASSERT(filter_start
==filter
);
42 for (;dst2
<dst_end0
;dst2
++)
44 const float *src
= dst2
;
47 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
53 for (;dst2
<dst_endr
;dst2
++, filter_end
--)//right margin
55 const float *src
= dst2
;
57 for(const float* f
=filter
;f
<filter_end
;f
++, src
++)
67 * 1. Source content starts from @dst;
68 * 2. Output content starts from @dst-@mwitdth;
71 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
73 * |- margin -|- src width*height items -|
74 * |- mwidth*heigth items -|- -|
75 * |- do NOT need to init -|- -|
76 * |- when input -|- -|
77 * |---------------------------------------------------------|
79 void xy_filter_c(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
81 ASSERT( stride
>=4*(width
+filter_width
) );
82 BYTE
* end
= reinterpret_cast<BYTE
*>(dst
) + height
*stride
;
83 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
84 for( ; dst_byte
<end
; dst_byte
+=stride
)
86 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
/***
 * Core 4-tap x 4-output SSE FIR step. Kept as a macro because an inline
 * function here sometimes generates poor code with some compilers.
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4 : const
 * With src[0..7] = concat(src4, src_5_8), computes for i in [0,4):
 *   sum[i] += f4[0]*src[i] + f4[1]*src[i+1] + f4[2]*src[i+2] + f4[3]*src[i+3]
 * NOTE: declares locals f4_1/src_3_6 and overwrites @src4 — wrap every
 * invocation in its own braces: { XY_FILTER_4(...); }
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
116 __forceinline
void xy_filter_one_line_sse_v4(float *dst
, int width
, const float *filter
, int filter_width
)
118 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
119 const float *filter_start
= filter
;
120 float *dst2
= dst
- filter_width
;
121 float *dst_endr
= dst
+ width
;
122 float *dst_end0
= dst_endr
- filter_width
;
123 float *dst_endl
= dst
- xx_fix
;
124 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
126 ASSERT(filter_start
== filter
);
127 filter_start
+= filter_width
;
128 const float *filter_end
= filter_start
;
130 for (;dst2
<dst_endl
;dst2
+=4)//left margin
132 const float *src
= dst
;
136 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
137 __m128 sum
= _mm_setzero_ps();
138 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
140 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
141 __m128 f4
= _mm_load_ps(f
);
143 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
148 _mm_store_ps(dst2
, sum
);
150 for (;dst2
<dst
;dst2
+=4)//if width < filter_width
152 const float *src
= dst
;
156 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
157 __m128 sum
= _mm_setzero_ps();
159 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
161 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
164 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
167 src_5_8
= _mm_setzero_ps();
168 f4
= _mm_load_ps(filter_end
);
169 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
171 _mm_store_ps(dst2
, sum
);
173 ASSERT(filter_start
== filter
);
174 for (;dst2
<dst_end0
;dst2
+=4)
176 const float *src
= dst2
;
179 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
180 __m128 sum
= _mm_setzero_ps();
181 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
184 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
185 __m128 f4
= _mm_load_ps(f
);
187 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
191 _mm_store_ps(dst2
, sum
);
193 for (;dst2
<dst_endr
;dst2
+=4)//right margin
195 const float *src
= dst2
;
199 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
200 __m128 sum
= _mm_setzero_ps();
202 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
205 src_5_8
= _mm_load_ps(src
);//5 6 7 8
208 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
211 //move new 4 in_n_out to old 4 in_n_out
213 src_5_8
= _mm_setzero_ps();
214 f4
= _mm_load_ps(filter_end
);
215 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
217 _mm_store_ps(dst2
, sum
);
224 void xy_filter_sse_v4(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
226 ASSERT( stride
>=4*(width
+filter_width
) );
227 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
229 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
230 BYTE
* end
= dst_byte
+ height
*stride
;
231 for( ; dst_byte
<end
; dst_byte
+=stride
)
233 xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
241 M128s
<LENGTH
- 4> next
;
243 template<int Index
> __forceinline __m128
& GetAt()
245 return next
.GetAt
<Index
- 4>();
247 template<> __forceinline __m128
& GetAt
<0>()
252 template<int Start
, int Offset
> __forceinline __m128
& GetAt()
254 return GetAt
<Start
+ Offset
>();
257 __forceinline
void Load(const float* src
)
259 x
= _mm_load_ps(src
);
267 void Load(const float* src
)
272 template<int FILTER_LENGTH
, int START
, int LENGTH
>
275 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
277 Filter4
<FILTER_LENGTH
,START
,LENGTH
-4>::do_cal(src0_128
, src4
, filter128s
, sum
);
278 __m128 src4_128
= _mm_load_ps(src4
+LENGTH
-4);
279 XY_FILTER_4(src0_128
, src4_128
, filter128s
.GetAt
<START
+LENGTH
-4>(), sum
);
284 template<int FILTER_LENGTH
, int START
>
285 struct Filter4
<FILTER_LENGTH
, START
, 0>
287 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
292 template<int FILTER_LENGTH
,int MARGIN_LENGTH
>
293 struct FilterAllMargin
295 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
298 __m128 src0
= _mm_setzero_ps();
299 __m128 sum
= _mm_setzero_ps();
300 Filter4
<FILTER_LENGTH
,MARGIN_LENGTH
-4,FILTER_LENGTH
-MARGIN_LENGTH
+4>::do_cal(src0
, src
, filter128s
, sum
);
301 _mm_store_ps(src
-MARGIN_LENGTH
, sum
);
302 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_left_margin(src
, filter128s
);
305 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
309 __m128 src0
= _mm_load_ps(src
);
310 __m128 sum
= _mm_setzero_ps();
311 Filter4
<FILTER_LENGTH
,0,MARGIN_LENGTH
-4>::do_cal(src0
, src
+4, filter128s
, sum
);
312 __m128 src4
= _mm_setzero_ps();
313 XY_FILTER_4(src0
, src4
, filter128s
.GetAt
<MARGIN_LENGTH
-4>(), sum
);
315 _mm_store_ps(src
, sum
);
317 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_right_margin(src
+4, filter128s
);
321 template<int FILTER_LENGTH
>
322 struct FilterAllMargin
<FILTER_LENGTH
,0>
324 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
327 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
335 * FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
337 template<int FILTER_LENGTH
>
338 void xy_filter_sse_template(float *dst
, int width
, int height
, int stride
, const float *filter
)
340 ASSERT( stride
>=4*(width
+FILTER_LENGTH
) );
341 ASSERT( ((stride
|(4*width
)|(4*FILTER_LENGTH
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
343 M128s
<FILTER_LENGTH
> filter128s
;
344 filter128s
.Load(filter
);
346 const float *filter_start
= filter
;
347 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
348 BYTE
* end
= dst_byte
+ height
*stride
;
349 for( ; dst_byte
<end
; dst_byte
+=stride
)
351 float *dst2
= reinterpret_cast<float*>(dst_byte
);
354 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_left_margin(dst2
, filter128s
);
355 float *dst_end1
= dst2
+ width
;
356 float *dst_end0
= dst_end1
- FILTER_LENGTH
;
357 for (;dst2
<dst_end0
;dst2
+=4)
359 const float *src
= dst2
;
362 __m128 src0
= _mm_load_ps(src
);/*1 2 3 4*/
364 __m128 sum
= _mm_setzero_ps();
365 Filter4
<FILTER_LENGTH
,0,FILTER_LENGTH
>::do_cal(src0
, src
, filter128s
, sum
);
367 _mm_store_ps(dst2
, sum
);
369 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_right_margin(dst2
, filter128s
);
376 void xy_filter_sse(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
378 typedef void (*Filter
)(float *dst
, int width
, int height
, int stride
, const float *filter
);
379 const Filter filters
[] = { xy_filter_sse_template
<4>, xy_filter_sse_template
<8>, xy_filter_sse_template
<12>, xy_filter_sse_template
<16>,
380 xy_filter_sse_template
<20>, xy_filter_sse_template
<24>, xy_filter_sse_template
<28>
382 if (filter_width
<=28 && filter_width
<=width
)
384 ASSERT(filter_width
%4==0);
385 filters
[(filter_width
-1)/4](dst
, width
, height
, stride
, filter
);
389 xy_filter_sse_v4(dst
, width
, height
, stride
, filter
, filter_width
);
394 * Copy and convert src to dst line by line.
395 * @dst_width MUST >= @width
396 * if @dst_width>@width, the extra elements will be filled with 0.
398 void xy_byte_2_float_c(float *dst
, int dst_width
, int dst_stride
,
399 PCUINT8 src
, int width
, int height
, int stride
)
401 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
403 PCUINT8 src_end
= src
+ height
*stride
;
404 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
407 PCUINT8 src_end
= src2
+ width
;
408 float *dst2
= reinterpret_cast<float*>(dst_byte
);
410 for (;src2
<src_end
;src2
++, dst2
++)
414 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
415 for (;dst2
<dst2_end
;dst2
++)
423 * See @xy_byte_2_float_c
425 void xy_byte_2_float_sse(float *dst
, int dst_width
, int dst_stride
,
426 PCUINT8 src
, int width
, int height
, int stride
)
428 ASSERT( dst_width
>=width
);
429 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
430 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
432 PCUINT8 src_end
= src
+ height
*stride
;
433 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
436 PCUINT8 src2_end0
= src2
+ (width
&~15);
437 float *dst2
= reinterpret_cast<float*>(dst_byte
);
439 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
442 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
443 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
444 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
445 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
446 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
447 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
448 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
449 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
450 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
451 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
452 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
453 _mm_store_ps(dst2
, dst_f1
);
454 _mm_store_ps(dst2
+4, dst_f2
);
455 _mm_store_ps(dst2
+8, dst_f3
);
456 _mm_store_ps(dst2
+12, dst_f4
);
458 PCUINT8 src2_end
= src
+ width
;
459 for (;src2
<src2_end
;src2
++,dst2
++)
463 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
464 for (;dst2
<dst2_end
;dst2
++)
472 * Copy transposed Matrix src to dst.
473 * @dst_width MUST >= @height.
474 * if @dst_width > @height, the extra elements will be filled with 0.
476 void xy_float_2_float_transpose_c(float *dst
, int dst_width
, int dst_stride
,
477 const float *src
, int width
, int height
, int src_stride
)
479 ASSERT(dst_width
>= height
);
480 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
481 const float* src_end
= src
+ width
;
482 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
483 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
485 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
487 float *dst2
= reinterpret_cast<float*>(dst_byte
);
488 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
490 *dst2
= *reinterpret_cast<const float*>(src2
);
492 float *dst2_end
= reinterpret_cast<float*>(dst_byte
) + dst_width
;
493 for (;dst2
<dst2_end
;dst2
++)
501 * see @xy_float_2_float_transpose_c
503 void xy_float_2_float_transpose_sse(float *dst
, int dst_width
, int dst_stride
,
504 const float *src
, int width
, int height
, int src_stride
)
507 typedef const float SrcT
;
509 ASSERT( (((int)dst
|dst_stride
)&15)==0 );
510 ASSERT(dst_width
>= height
);
511 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
512 SrcT
* src_end
= src
+ width
;
513 PCUINT8 src2_end1
= reinterpret_cast<PCUINT8
>(src
) + (height
&~3)*src_stride
;
514 PCUINT8 src2_end2
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
515 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
517 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
519 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
520 for (;src2
<src2_end1
;src2
+=4*src_stride
,dst2
+=4)
522 __m128 m1
= _mm_set_ps(
523 *(SrcT
*)(src2
+3*src_stride
),
524 *(SrcT
*)(src2
+2*src_stride
),
525 *(SrcT
*)(src2
+src_stride
),
527 _mm_store_ps(dst2
, m1
);
529 for (;src2
<src2_end2
;src2
+=src_stride
,dst2
++)
531 *dst2
= *reinterpret_cast<SrcT
*>(src2
);
533 float *dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
534 for (;dst2
<dst2_end
;dst2
++)
542 * Transpose and round Matrix src, then copy to dst.
543 * @dst_width MUST >= @height.
544 * if @dst_width > @height, the extra elements will be filled with 0.
546 void xy_float_2_byte_transpose_c(UINT8
*dst
, int dst_width
, int dst_stride
,
547 const float *src
, int width
, int height
, int src_stride
)
549 ASSERT(dst_width
>= height
);
550 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
551 const float* src_end
= src
+ width
;
552 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
553 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
555 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
557 UINT8
*dst2
= reinterpret_cast<UINT8
*>(dst_byte
);
558 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
560 *dst2
= static_cast<UINT8
>(*reinterpret_cast<const float*>(src2
)+0.5);
562 UINT8
*dst2_end
= reinterpret_cast<UINT8
*>(dst_byte
) + dst_width
;
563 for (;dst2
<dst2_end
;dst2
++)
570 void xy_float_2_byte_transpose_sse(UINT8
*dst
, int dst_width
, int dst_stride
,
571 const float *src
, int width
, int height
, int src_stride
)
574 typedef const float SrcT
;
576 ASSERT(dst_width
>= height
);
577 ASSERT((((int)dst
|dst_stride
)&15)==0);
578 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
579 SrcT
* src_end
= src
+ width
;
580 PCUINT8 src2_end00
= reinterpret_cast<PCUINT8
>(src
) + (height
&~15)*src_stride
;
581 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
582 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
584 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
586 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
587 for (;src2
<src2_end00
;src2
+=16*src_stride
,dst2
+=16)
589 __m128 m1
= _mm_set_ps(
590 *(SrcT
*)(src2
+3*src_stride
),
591 *(SrcT
*)(src2
+2*src_stride
),
592 *(SrcT
*)(src2
+src_stride
),
594 __m128 m2
= _mm_set_ps(
595 *(SrcT
*)(src2
+7*src_stride
),
596 *(SrcT
*)(src2
+6*src_stride
),
597 *(SrcT
*)(src2
+5*src_stride
),
598 *(SrcT
*)(src2
+4*src_stride
));
599 __m128 m3
= _mm_set_ps(
600 *(SrcT
*)(src2
+11*src_stride
),
601 *(SrcT
*)(src2
+10*src_stride
),
602 *(SrcT
*)(src2
+9*src_stride
),
603 *(SrcT
*)(src2
+8*src_stride
));
604 __m128 m4
= _mm_set_ps(
605 *(SrcT
*)(src2
+15*src_stride
),
606 *(SrcT
*)(src2
+14*src_stride
),
607 *(SrcT
*)(src2
+13*src_stride
),
608 *(SrcT
*)(src2
+12*src_stride
));
610 __m128i i1
= _mm_cvtps_epi32(m1
);
611 __m128i i2
= _mm_cvtps_epi32(m2
);
612 __m128i i3
= _mm_cvtps_epi32(m3
);
613 __m128i i4
= _mm_cvtps_epi32(m4
);
615 i1
= _mm_packs_epi32(i1
,i2
);
616 i3
= _mm_packs_epi32(i3
,i4
);
617 i1
= _mm_packus_epi16(i1
,i3
);
619 _mm_store_si128((__m128i
*)dst2
, i1
);
621 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
623 *dst2
= static_cast<DstT
>(*reinterpret_cast<SrcT
*>(src2
)+0.5);
625 DstT
*dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
626 for (;dst2
<dst2_end
;dst2
++)
634 * To Do: decent CPU capability check
636 void xy_gaussian_blur(PUINT8 dst
, int dst_stride
,
637 PCUINT8 src
, int width
, int height
, int stride
,
638 const float *gt
, int r
, int gt_ex_width
)
640 ASSERT(width
<=stride
&& width
+2*r
<=dst_stride
);
641 int ex_mask_width
= ((r
*2+1)+3)&~3;
642 ASSERT(ex_mask_width
<=gt_ex_width
);
643 if (ex_mask_width
>gt_ex_width
)
650 int fwidth
= (width
+3)&~3;
651 int fstride
= (fwidth
+ ex_mask_width
)*sizeof(float);
652 int fheight
= (height
+3)&~3;
653 int fstride_ver
= (fheight
+ex_mask_width
)*sizeof(float);
655 PUINT8 buff_base
= reinterpret_cast<PUINT8
>(xy_malloc(height
*fstride
+ (fwidth
+ ex_mask_width
)*fstride_ver
));
657 float *hor_buff_base
= reinterpret_cast<float*>(buff_base
);
658 float *hor_buff
= hor_buff_base
+ ex_mask_width
;
661 ASSERT( ((width
+15)&~15)<=stride
);
662 xy_byte_2_float_sse(hor_buff
, fwidth
, fstride
, src
, width
, height
, stride
);
665 xy_filter_sse(hor_buff
, fwidth
, height
, fstride
, gt
, ex_mask_width
);
669 float *ver_buff_base
= reinterpret_cast<float*>(buff_base
+ height
*fstride
);
670 float *ver_buff
= ver_buff_base
+ ex_mask_width
;
672 int true_width
= width
+r
*2;
673 xy_float_2_float_transpose_sse(ver_buff
, fheight
, fstride_ver
, hor_buff
-r
*2, true_width
, height
, fstride
);
676 xy_filter_sse(ver_buff
, fheight
, true_width
, fstride_ver
, gt
, ex_mask_width
);
679 int true_height
= height
+ 2*r
;
680 xy_float_2_byte_transpose_sse(dst
, true_width
, dst_stride
, ver_buff
-r
*2, true_height
, true_width
, fstride_ver
);