/* Retrieved from the VLC git web view:
 *   [vlc.git] / modules / codec / ffmpeg / postprocessing / postprocessing_mmx.c
 *   blob 7c209c4c7bb06cebdc02b8dae2a9fe27ca0f63d0
 * Commit subject: fixed several format string inconsistencies and deprecated
 * C constructions.
 */
/*****************************************************************************
 * postprocessing_mmx.c: Post Processing library in MMX
 *****************************************************************************
 * Copyright (C) 2001 VideoLAN
 * $Id: postprocessing_mmx.c,v 1.3 2002/12/18 14:17:10 sam Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/
24 #include <vlc/vlc.h> /* only use uint8_t, uint32_t .... */
26 #include "postprocessing.h"
27 #include "postprocessing_common.h"
29 /*****************************************************************************
31 * Internals functions common to pp_Deblock_V and pp_Deblock_H
33 *****************************************************************************/
35 /*****************************************************************************
36 * MMX stuff
37 *****************************************************************************/
40 /* XXX PP_THR1 need to be defined as ULL */
42 /* Use same things as in idct but how it work ? */
43 #define UNUSED_LONGLONG( foo ) \
44 static const unsigned long long foo __asm__ (#foo) __attribute__((unused))
46 /* to calculate isDC_mode for mmx */
47 UNUSED_LONGLONG( mmx_127_thr1 ) = ( ( 127ULL - PP_THR1 ) << 56 )|
48 ( ( 127ULL - PP_THR1 ) << 48 )|
49 ( ( 127ULL - PP_THR1 ) << 40 )|
50 ( ( 127ULL - PP_THR1 ) << 32 )|
51 ( ( 127ULL - PP_THR1 ) << 24 )|
52 ( ( 127ULL - PP_THR1 ) << 16 )|
53 ( ( 127ULL - PP_THR1 ) << 8 )|
54 ( ( 127ULL - PP_THR1 ) );
56 UNUSED_LONGLONG( mmx_127_2xthr1_1 ) = ( ( 127ULL - PP_2xTHR1 -1) << 56 )|
57 ( ( 127ULL - PP_2xTHR1 -1 ) << 48 )|
58 ( ( 127ULL - PP_2xTHR1 -1 ) << 40 )|
59 ( ( 127ULL - PP_2xTHR1 -1 ) << 32 )|
60 ( ( 127ULL - PP_2xTHR1 -1 ) << 24 )|
61 ( ( 127ULL - PP_2xTHR1 -1 ) << 16 )|
62 ( ( 127ULL - PP_2xTHR1 -1 ) << 8 )|
63 ( ( 127ULL - PP_2xTHR1 -1 ) );
65 UNUSED_LONGLONG( mmx_m2_5_m5_2 ) = 0xfffe0005fffb0002ULL;
#if 0
/* Disabled helpers for an MMXEXT (pminub/pmaxub/pshufw) code path.
 * They expand to asm instruction strings. */

/* find the min byte of r and set it in r, t is destroyed */
#define MMXEXT_GET_PMIN( r, t ) \
    "movq      " #r ", " #t "                                   \n" \
    "psrlq     $8, " #t "                                       \n" \
    "pminub    " #t ", " #r "                                   \n" \
    "pshufw    $0xf5, " #r ", " #t " #instead of shift with tmp reg \n" \
    "pminub    " #t ", " #r "                                   \n" \
    "pshufw    $0xfe, " #r ", " #t "                            \n" \
    "pminub    " #t ", " #r "                                   \n"

/* find the max byte of r and set it in r, t is destroyed */
#define MMXEXT_GET_PMAX( r, t ) \
    "movq      " #r ", " #t "                                   \n" \
    "psrlq     $8, " #t "                                       \n" \
    "pmaxub    " #t ", " #r "                                   \n" \
    "pshufw    $0xf5, " #r ", " #t "                            \n" \
    "pmaxub    " #t ", " #r "                                   \n" \
    "pshufw    $0xfe, " #r ", " #t "                            \n" \
    "pmaxub    " #t ", " #r "                                   \n"

/* load 8 bytes from s into t, then fold them into the running per-lane
   minimum m and maximum M */
#define MMXEXT_GET_LMINMAX( s, m, M, t ) \
    "movq      " #s ", " #t "                                   \n" \
    "pminub    " #t ", " #m "                                   \n" \
    "pmaxub    " #t ", " #M "                                   \n"

/* Some tips for MMX

  * |a-b| :
      d1 = a - b with unsigned saturate
      d2 = b - a  with ...
      |a-b| = d1 | d2

*/
#endif
106 /****************************************************************************
107 * pp_deblock_isDC_mode : Check if we will use DC mode or Default mode
108 ****************************************************************************
109 * Use constant PP_THR1 and PP_THR2 ( PP_2xTHR1 )
111 * Called for for each pixel on a boundary block when doing deblocking
112 * so need to be fast ...
114 ****************************************************************************/
115 static inline int pp_deblock_isDC_mode( uint8_t *p_v )
117 unsigned int i_eq_cnt;
119 /* algo :
120 x = v[i] - v[i+1] without signed saturation
121 ( XXX see if there is'nt problem, but can't be with signed
122 sat because pixel will be saturate :(
123 so x within [-128, 127] and we have to test if it fit in [-M, M]
124 we add 127-M with wrap around -> good value fit in [ 127-2*M, 127]
125 and if x >= 127 - 2 * M ie x > 127 -2*M - 1 value is good
127 __asm__ __volatile__ (
128 " #* Do (v0-v1) to (v7-v8) \n"
129 "movq (%1), %%mm1 # load v0->v7 \n"
130 "movq 1(%1), %%mm2 # load v1->v8 \n"
131 "psubb %%mm2, %%mm1 # v[i]-v[i+1] \n"
132 "paddb mmx_127_thr1, %%mm1 # + 127-THR1 with wrap \n"
133 "pcmpgtb mmx_127_2xthr1_1, %%mm1 # > 127 -2*thr1 - 1 \n"
134 "movq %%mm1, %%mm0 # \n"
135 "psrlw $8, %%mm1 # \n"
136 "paddb %%mm1, %%mm0 # \n"
137 " # \n"
138 "movq %%mm0, %%mm1 # Now sum to create eq_cnt \n"
139 "psrld $16, %%mm0 # \n"
140 "paddb %%mm0, %%mm1 # \n"
141 " # \n"
142 "movq %%mm1, %%mm0 # \n"
143 "psrlq $32, %%mm1 # \n"
144 "paddb %%mm1, %%mm0 \n"
145 "movd %%mm0, %0 # \n"
146 "negl %0 \n"
147 "andl $255, %0"
149 : "=r"(i_eq_cnt) : "r" (p_v) );
151 /* last test, hey, 9 don't fit in MMX */
152 if(( ( p_v[8] - p_v[9] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
154 i_eq_cnt++;
157 #if 0
158 /* algo : if ( | v[i] -v[i+1] | <= PP_THR1 ) { i_eq_cnt++; } */
159 i_eq_cnt = 0;
161 for( i =0; i < 9; i++ )
163 if(( ( p_v[i] - p_v[i+1] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
165 i_eq_cnt++;
168 #endif
170 return( (i_eq_cnt >= PP_THR2 ) ? 1 : 0 );
/****************************************************************************
 * pp_deblock_isMinMaxOk : range test used before applying the DC filter
 ****************************************************************************
 * p_v points to 10 pixels v[0..9]; only v[1]..v[8] are inspected.
 * Returns 1 when ( max - min ) of those eight pixels is strictly smaller
 * than 2 * i_QP, 0 otherwise.
 ****************************************************************************/
static inline int pp_deblock_isMinMaxOk( const uint8_t *p_v, int i_QP )
{
    int i_min = p_v[1];
    int i_max = p_v[1];
    int i;

    for( i = 2; i < 9; i++ )
    {
        if( p_v[i] > i_max )
        {
            i_max = p_v[i];
        }
        if( p_v[i] < i_min )
        {
            i_min = p_v[i];
        }
    }

    return( ( i_max - i_min ) < 2 * i_QP ? 1 : 0 );
}
/****************************************************************************
 * pp_deblock_DefaultMode : default (non DC) deblocking of one boundary
 ****************************************************************************
 * i_v holds 10 pixels v[0..9] across a block boundary (between v[4] and
 * v[5]).  Only v[4] and v[5] may be modified, in place.
 *
 *   a3x0 = 2 * ( v[3] - v[6] ) + 5 * ( v[5] - v[4] )
 *   a3x1 = 2 * ( v[1] - v[4] ) + 5 * ( v[3] - v[2] )
 *   a3x2 = 2 * ( v[5] - v[8] ) + 5 * ( v[7] - v[6] )
 *
 *   d = CLIP( 5(a3x0' - a3x0)//8, 0, (v4-v5)/2 ).d( abs(a3x0) < QP )
 *
 * Nothing is done when a3x0 == 0 or |a3x0| >= 8 * i_QP.
 * i_stride is not used by this implementation.
 *
 * The asm references the file-scope constant mmx_m2_5_m5_2 by symbol name.
 ****************************************************************************/
static inline void pp_deblock_DefaultMode( uint8_t i_v[10], int i_stride,
                                           int i_QP )
{
    int d, i_delta;
    int a3x0, a3x0_, a3x1, a3x2;
    int b_neg;

    (void)i_stride;

    /* First calculate a3x0: the four bytes v[3..6] are widened to words and
       multiplied/accumulated against ( 2, -5, 5, -2 ).  The "memory" clobber
       makes the compiler complete pending stores to i_v[] before the asm
       reads it through the pointer. */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7             # mm7 = 0               \n"
        "movq      mmx_m2_5_m5_2, %%mm6     # mm6 =(2,-5,5,-2)      \n"
        "movd      3(%1), %%mm0                                     \n"
        "punpcklbw %%mm7,%%mm0                                      \n"
        "pmaddwd   %%mm6, %%mm0                                     \n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $32, %%mm1                                       \n"
        "paddd     %%mm1, %%mm0                                     \n"
        "movd      %%mm0, %0"
        : "=r"(a3x0) : "r"(i_v) : "memory" );

    if( a3x0 < 0 )
    {
        b_neg = 1;
        a3x0  = -a3x0;
    }
    else
    {
        b_neg = 0;
    }

    /* XXX Now a3x0 is abs( a3x0 ) */
    if( a3x0 == 0 || a3x0 >= 8 * i_QP )     /* need 0 < |a3x0| < 8*i_QP */
    {
        return;
    }

    /* calculate a3x1 and a3x2.
       mm6/mm7 are loaded again here: the compiler gives no guarantee that
       register contents survive from one asm statement to the next. */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7             # mm7 = 0               \n"
        "movq      mmx_m2_5_m5_2, %%mm6     # mm6 = ( 2, -5, 5, -2 )\n"
        "movd      1(%2), %%mm0                                     \n"
        "movd      5(%2), %%mm2                                     \n"
        "punpcklbw %%mm7,%%mm0                                      \n"
        "punpcklbw %%mm7,%%mm2                                      \n"
        "pmaddwd   %%mm6, %%mm0                                     \n"
        "pmaddwd   %%mm6, %%mm2                                     \n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $32, %%mm1                                       \n"
        "paddd     %%mm1, %%mm0             # mm0 = a3x1            \n"
        "movd      %%mm0, %0                                        \n"
        "movq      %%mm2, %%mm1                                     \n"
        "psrlq     $32, %%mm1                                       \n"
        "paddd     %%mm1, %%mm2             # mm2 = a3x2            \n"
        "movd      %%mm2, %1                                        \n"
        : "=r"(a3x1), "=r"(a3x2) : "r"(i_v) : "memory" );

    if( a3x1 < 0 ) a3x1 = -a3x1; /* abs( a3x1 ) */
    if( a3x2 < 0 ) a3x2 = -a3x2; /* abs( a3x2 ) */

    a3x0_ = PP_MIN3( a3x0, a3x1, a3x2 );

    d = 5 * ( a3x0 - a3x0_ ) / 8; /* always >= 0 */

    i_delta = ( i_v[4] - i_v[5] ) / 2;

    /* clip into [0, i_delta] or [i_delta, 0] */
    if( i_delta < 0 )
    {
        if( !b_neg ) /* since true d has sgn(d) = - sgn( a3x0 ) */
        {
            d = -d;
            if( d < i_delta ) d = i_delta;
            i_v[4] -= d;
            i_v[5] += d;
        }
    }
    else
    {
        if( b_neg )
        {
            if( d > i_delta ) d = i_delta;
            i_v[4] -= d;
            i_v[5] += d;
        }
    }
}
/****************************************************************************
 * pp_deblock_DCMode : smoothing filter applied across a boundary in DC mode
 ****************************************************************************
 * p_v points to 10 pixels v[0..9]; v[1]..v[8] are replaced in place.
 *
 * With  p0 = |v[1] - v[0]| < i_QP ? v[0] : v[1]
 *       p9 = |v[8] - v[9]| < i_QP ? v[9] : v[8]
 * and v[] denoting the values before modification, the asm computes:
 *
 *   v'[1] = ( 6*p0 + 4*v[1] + 2*(v[2]+v[3]) + v[4] + v[5] ) >> 4
 *   v'[2] = ( 4*p0 + 2*v[1] + 4*v[2] + 2*(v[3]+v[4]) + v[5] + v[6] ) >> 4
 *   v'[3] = ( 2*p0 + 2*(v[1]+v[2]) + 4*v[3] + 2*(v[4]+v[5])
 *             + v[6] + v[7] ) >> 4
 *   v'[4] = ( p0 + v[1] + 2*(v[2]+v[3]) + 4*v[4] + 2*(v[5]+v[6])
 *             + v[7] + v[8] ) >> 4
 *   v'[5] = ( v[1] + v[2] + 2*(v[3]+v[4]) + 4*v[5] + 2*(v[6]+v[7])
 *             + v[8] + p9 ) >> 4
 *   v'[6] = ( v[2] + v[3] + 2*(v[4]+v[5]) + 4*v[6] + 2*(v[7]+v[8])
 *             + 2*p9 ) >> 4
 *   v'[7] = ( v[3] + v[4] + 2*(v[5]+v[6]) + 4*v[7] + 2*v[8] + 4*p9 ) >> 4
 *   v'[8] = ( v[4] + v[5] + 2*(v[6]+v[7]) + 4*v[8] + 6*p9 ) >> 4
 ****************************************************************************/
static inline void pp_deblock_DCMode( uint8_t *p_v, /* = int i_v[10] */
                                      int i_QP )
{
    int i_p0, i_p9;

    i_p0 = PP_ABS( p_v[1] - p_v[0] ) < i_QP ? p_v[0] : p_v[1];
    i_p9 = PP_ABS( p_v[8] - p_v[9] ) < i_QP ? p_v[9] : p_v[8];

    /* Register usage:
        mm0 = 8 pix unmodified
        mm1 = scratch, successively (as four words):
              first part of the 4 first pix
                ( p0, v1, v2, v3 ) -> ( p0, p0, v1, v2 ) -> ... -> 4 x p0
              common part between first and last pix
                ( v2, v3, v4, v5 ) -> ( v3, v4, v5, v6 ) -> ...
                                                      -> ( v5, v6, v7, v8 )
              last part of the 4 last pix
                ( v6, v7, v8, p9 ) -> ( v7, v8, p9, p9 ) -> ... -> 4 x p9
        mm2 = accumulator for the 4 first new pix
        mm3 = accumulator for the 4 last new pix
        mm4 = unused
        mm5 = p0
        mm6 = p9 << 48
        mm7 = 0 */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7                                     \n"
        "movq      1(%0), %%mm0     # get 8 pix                     \n"
        "                           # unpack into mm1               \n"
        "movq      %%mm0, %%mm1                                     \n"
        "punpcklbw %%mm7, %%mm1                                     \n"
        "                           # get p_0 and i_p9              \n"
        "movd      %1,    %%mm5                                     \n"
        "movd      %2,    %%mm6                                     \n"
        "psllq     $48,   %%mm6                                     \n"
        "                                                           \n"
        "movq      %%mm1, %%mm3     # p_v[5-8] = v[1-4] !!          \n"
        "movq      %%mm1, %%mm2                                     \n"
        "psllw     $2,    %%mm2     # p_v[1-4] = 4*v[1-4]           \n"
        "                                                           \n"
        "psllq     $16,   %%mm1                                     \n"
        "por       %%mm5, %%mm1     # mm1 =( p0, v1, v2 ,v3)        \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "                                                           \n"
        "psllq     $16,   %%mm1                                     \n"
        "por       %%mm5, %%mm1     # mm1 =( p0, p0, v1, v2)        \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "                                                           \n"
        "psllq     $16,   %%mm1                                     \n"
        "por       %%mm5, %%mm1     # mm1 =( p0, p0, p0, v1)        \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "                                                           \n"
        "psllq     $16,   %%mm1                                     \n"
        "por       %%mm5, %%mm1     # mm1 =( p0, p0, p0, p0)        \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "                           # Now last part a little boring \n"
        "                           # last part for mm2, beginning for mm3\n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $8,    %%mm1                                     \n"
        "punpcklbw %%mm7, %%mm1     # mm1 =( v2, v3, v4, v5 )       \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $16,   %%mm1                                     \n"
        "punpcklbw %%mm7, %%mm1     # mm1 =( v3, v4, v5, v6 )       \n"
        "psllw     $1,    %%mm1                                     \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $24,   %%mm1                                     \n"
        "punpcklbw %%mm7, %%mm1     # mm1 =( v4, v5, v6, v7)        \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "movq      %%mm0, %%mm1                                     \n"
        "psrlq     $32,   %%mm1                                     \n"
        "punpcklbw %%mm7, %%mm1     # mm1 =( v5, v6, v7, v8)        \n"
        "paddw     %%mm1, %%mm2                                     \n"
        "psllw     $2,    %%mm1                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                           # Now last part for last 4 pix  \n"
        "                           #                               \n"
        "movq      %%mm0, %%mm1                                     \n"
        "punpckhbw %%mm7, %%mm1     # mm1 = ( v5, v6, v7, v8)       \n"
        "                                                           \n"
        "psrlq     $16,   %%mm1                                     \n"
        "por       %%mm6, %%mm1     # mm1 =( v6, v7, v8, p9 )       \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "psrlq     $16,   %%mm1                                     \n"
        "por       %%mm6, %%mm1     # mm1 =( v7, v8, p9, p9)        \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "psrlq     $16,   %%mm1                                     \n"
        "por       %%mm6, %%mm1     # mm1 =( v8, p9, p9, p9 )       \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "psrlq     $16,   %%mm1                                     \n"
        "por       %%mm6, %%mm1     # mm1 =( p9, p9, p9, p9 )       \n"
        "                                                           \n"
        "paddw     %%mm1, %%mm3                                     \n"
        "                                                           \n"
        "psrlw     $4,    %%mm2                                     \n"
        "psrlw     $4,    %%mm3                                     \n"
        "packuswb  %%mm3, %%mm2                                     \n"
        "movq      %%mm2, 1(%0)                                     \n"

        : : "r"(p_v), "r"(i_p0), "r"(i_p9) : "memory" );
}
466 /*****************************************************************************/
467 /*---------------------------------------------------------------------------*/
468 /* */
469 /* ---------- filter Vertical lines so follow horizontal edges -------- */
470 /* */
471 /*---------------------------------------------------------------------------*/
472 /*****************************************************************************/
474 void E_( pp_deblock_V )( uint8_t *p_plane,
475 int i_width, int i_height, int i_stride,
476 QT_STORE_T *p_QP_store, int i_QP_stride,
477 int b_chroma )
479 int x, y, i;
480 uint8_t *p_v;
481 int i_QP_scale; /* use to do ( ? >> i_QP_scale ) */
482 int i_QP;
484 uint8_t i_v[10];
486 i_QP_scale = b_chroma ? 5 : 4 ;
488 for( y = 8; y < i_height - 4; y += 8 )
490 p_v = p_plane + ( y - 5 )* i_stride;
491 for( x = 0; x < i_width; x++ )
493 /* First get 10 vert pix to use them without i_stride */
494 for( i = 0; i < 10; i++ )
496 i_v[i] = p_v[i*i_stride + x];
499 i_QP = p_QP_store[(y>>i_QP_scale)*i_QP_stride+
500 (x>>i_QP_scale)];
501 /* XXX QP is for v5 */
502 if( pp_deblock_isDC_mode( i_v ) )
504 if( pp_deblock_isMinMaxOk( i_v, i_QP ) )
506 pp_deblock_DCMode( i_v, i_QP );
509 else
511 pp_deblock_DefaultMode( i_v, i_stride, i_QP );
514 /* Copy back, XXX only 1-8 were modified */
515 for( i = 1; i < 9; i++ )
517 p_v[i*i_stride + x] = i_v[i];
523 return;
525 /*****************************************************************************/
526 /*---------------------------------------------------------------------------*/
527 /* */
528 /* --------- filter Horizontal lines so follow vertical edges -------- */
529 /* */
530 /*---------------------------------------------------------------------------*/
531 /*****************************************************************************/
533 void E_( pp_deblock_H )( uint8_t *p_plane,
534 int i_width, int i_height, int i_stride,
535 QT_STORE_T *p_QP_store, int i_QP_stride,
536 int b_chroma )
538 int x, y;
539 uint8_t *p_v;
540 int i_QP_scale;
541 int i_QP;
543 i_QP_scale = b_chroma ? 5 : 4 ;
545 for( y = 0; y < i_height; y++ )
547 p_v = p_plane + y * i_stride - 5;
548 for( x = 8; x < i_width - 4; x += 8 )
550 /* p_v point 5 pix before a block boundary */
551 /* XXX QP is for v5 */
552 i_QP = p_QP_store[(y>>i_QP_scale)*i_QP_stride+
553 (x>>i_QP_scale)];
554 if( pp_deblock_isDC_mode( p_v + x ) )
556 if( pp_deblock_isMinMaxOk( p_v+ x, i_QP ) )
558 pp_deblock_DCMode( p_v+x, i_QP );
561 else
563 pp_deblock_DefaultMode( p_v+x, i_stride, i_QP );
568 return;
/*****************************************************************************
 *
 * Internals functions common to pp_Dering_Y pp_Dering_C
 *
 *****************************************************************************/

/*
 * Scan the 8x8 block starting at p_block (line pitch i_stride) and store its
 * smallest pixel value in *pi_min and its largest in *pi_max.
 */
static inline void pp_dering_MinMax( const uint8_t *p_block, int i_stride,
                                     int *pi_min, int *pi_max )
{
    int x, y;
    int i_min = 255;
    int i_max = 0;

    for( y = 0; y < 8; y++ )
    {
        for( x = 0; x < 8; x++ )
        {
            const int i_pix = p_block[x];

            if( i_pix < i_min )
            {
                i_min = i_pix;
            }
            if( i_pix > i_max )
            {
                i_max = i_pix;
            }
        }
        p_block += i_stride;
    }

    *pi_min = i_min;
    *pi_max = i_max;
}
/*
 * Build the binary index of a 10x10 pixel area starting at p_block (line
 * pitch i_stride) and store one 32-bit word per row in p_bin[0..9].
 *
 * For each row, bit x (x = 0..9) is first set when pixel x is > i_thr, and
 * the complement of that word is placed 16 bits higher.  The stored word is
 * that value ANDed with itself shifted by one in both directions: a low bit x
 * survives only if pixels x-1, x and x+1 are all above the threshold, and
 * bit 16+x survives only if all three are not above it.
 */
static inline void pp_dering_BinIndex( const uint8_t *p_block, int i_stride,
                                       int i_thr, uint32_t *p_bin )
{
    int x, y;

    for( y = 0; y < 10; y++ )
    {
        uint32_t i_bin = 0;

        for( x = 0; x < 10; x++ )
        {
            if( p_block[x] > i_thr )
            {
                i_bin |= 1 << x;
            }
        }

        i_bin |= (~i_bin) << 16;  /* for detect also three 0 */
        p_bin[y] = i_bin & ( i_bin >> 1 ) & ( i_bin << 1 );

        p_block += i_stride;
    }
}
/*
 * Adaptive smoothing of the 8x8 block at p_block (line pitch i_stride),
 * done in place.
 *
 * p_bin[0..9] is the binary index of the surrounding 10x10 area as produced
 * by pp_dering_BinIndex (row 0 is the line above the block).  A pixel is
 * filtered only when the index words of its three rows all flag it; it is
 * then replaced by the 3x3 weighted mean
 *
 *        1 2 1
 *        2 4 2   + (8)   >> 4
 *        1 2 1
 *
 * limited to at most i_QP / 2 away from the original value.  All results are
 * computed from the unmodified block and written back at the end.
 */
static inline void pp_dering_Filter( uint8_t *p_block, int i_stride,
                                     uint32_t *p_bin,
                                     int i_QP )
{
    int x, y;
    int i_flt[8][8];
    uint8_t *p_row = p_block;
    const int i_QP_2 = i_QP >> 1;

    for( y = 0; y < 8; y++ )
    {
        uint32_t i_bin = p_bin[y] & p_bin[y+1] & p_bin[y+2];
        i_bin |= i_bin >> 16; /* detect 0 or 1 */

        for( x = 0; x < 8; x++ )
        {
            const int i_pix = p_row[x];
            int i_out = i_pix;

            if( i_bin & 0x02 ) /* 0x02 since 10 index but want 1-9 */
            {
                int i_f;

                i_f =   p_row[x - i_stride - 1] +
                      ( p_row[x - i_stride    ] << 1 ) +
                        p_row[x - i_stride + 1] +

                      ( p_row[x - 1] << 1 ) +
                      ( p_row[x    ] << 2 ) +
                      ( p_row[x + 1] << 1 ) +

                        p_row[x + i_stride - 1] +
                      ( p_row[x + i_stride    ] << 1 ) +
                        p_row[x + i_stride + 1];

                i_f = ( 8 + i_f ) >> 4;

                /* Clamp the correction to +/- i_QP_2 */
                if( i_f - i_pix > i_QP_2 )
                {
                    i_out = i_pix + i_QP_2;
                }
                else if( i_f - i_pix < -i_QP_2 )
                {
                    i_out = i_pix - i_QP_2;
                }
                else
                {
                    i_out = i_f;
                }
            }

            i_flt[y][x] = i_out;
            i_bin >>= 1;
        }
        p_row += i_stride;
    }

    /* write the filtered block back */
    p_row = p_block;
    for( y = 0; y < 8; y++ )
    {
        for( x = 0; x < 8; x++ )
        {
            p_row[x] = i_flt[y][x];
        }
        p_row += i_stride;
    }
}
733 /*****************************************************************************/
734 /*---------------------------------------------------------------------------*/
735 /* */
736 /* ----------------- Dering filter on Y and C blocks ----------------- */
737 /* */
738 /*---------------------------------------------------------------------------*/
739 /*****************************************************************************/
741 void E_( pp_dering_Y )( uint8_t *p_plane,
742 int i_width, int i_height, int i_stride,
743 QT_STORE_T *p_QP_store, int i_QP_stride )
745 int x, y, k;
746 int i_max[4], i_min[4], i_range[4];
747 int i_thr[4];
748 int i_max_range, i_kmax;
749 uint32_t i_bin[4][10];
750 uint8_t *p_block[4];
751 QT_STORE_T *p_QP;
753 /* We process 4 blocks/loop*/
754 for( y = 8; y < i_height-8; y += 16 )
756 /* +---+
757 |0|1|
758 +-+-+ :))
759 |2|3|
760 +-+-+ */
762 p_block[0] = p_plane + y * i_stride + 8;
763 p_block[1] = p_block[0] + 8;
764 p_block[2] = p_block[0] + ( i_stride << 3 );
765 p_block[3] = p_block[2] + 8;
767 for( x = 8; x < i_width-8; x += 16 )
769 /* 1: Calculate threshold */
770 /* Calculate max/min for each block */
771 pp_dering_MinMax( p_block[0], i_stride, &i_min[0], &i_max[0] );
772 pp_dering_MinMax( p_block[1], i_stride, &i_min[1], &i_max[1] );
773 pp_dering_MinMax( p_block[2], i_stride, &i_min[2], &i_max[2] );
774 pp_dering_MinMax( p_block[3], i_stride, &i_min[3], &i_max[3] );
775 /* Calculate range, max_range and thr */
776 i_max_range = 0; i_kmax = 0;
777 for( k = 0; k <= 4; k++ )
779 i_range[k] = i_max[k] - i_min[k];
780 i_thr[k] = ( i_max[k] + i_min[k] + 1 )/2;
781 if( i_max_range < i_max[k])
783 i_max_range = i_max[k];
784 i_kmax = k;
787 /* Now rearrange thr */
788 if( i_max_range > 64 )
790 for( k = 1; k < 5; k++ )
792 if( i_range[k] < 16 )
794 i_thr[k] = 0;
796 else
797 if( i_range[k] < 32 )
799 i_thr[k] = i_thr[i_kmax];
803 else
805 for( k = 1; k < 5; k++ )
807 if( i_range[k] < 16 )
809 i_thr[k] = 0;
813 /* 2: Index acquisition 10x10 ! so " -i_stride - 1"*/
814 pp_dering_BinIndex( p_block[0] - i_stride - 1, i_stride,
815 i_thr[0], i_bin[0] );
816 pp_dering_BinIndex( p_block[1] - i_stride - 1, i_stride,
817 i_thr[1], i_bin[1] );
818 pp_dering_BinIndex( p_block[2] - i_stride - 1, i_stride,
819 i_thr[2], i_bin[2] );
820 pp_dering_BinIndex( p_block[3] - i_stride - 1, i_stride,
821 i_thr[3], i_bin[3] );
824 /* 3: adaptive smoothing */
825 /* since we begin at (8,8) QP can be different for each block */
826 p_QP = &( p_QP_store[( y >> 4) * i_QP_stride + (x >> 4)] );
828 pp_dering_Filter( p_block[0], i_stride,
829 i_bin[0], p_QP[0] );
831 pp_dering_Filter( p_block[1], i_stride,
832 i_bin[1], p_QP[1] );
834 pp_dering_Filter( p_block[2], i_stride,
835 i_bin[2], p_QP[i_QP_stride] );
837 pp_dering_Filter( p_block[3], i_stride,
838 i_bin[3], p_QP[i_QP_stride+1] );
840 p_block[0] += 8;
841 p_block[1] += 8;
842 p_block[2] += 8;
843 p_block[3] += 8;
849 void E_( pp_dering_C )( uint8_t *p_plane,
850 int i_width, int i_height, int i_stride,
851 QT_STORE_T *p_QP_store, int i_QP_stride )
853 int x, y;
854 int i_max, i_min;
855 int i_thr;
856 uint32_t i_bin[10];
858 uint8_t *p_block;
861 for( y = 8; y < i_height-8; y += 8 )
864 p_block = p_plane + y * i_stride + 8;
865 for( x = 8; x < i_width-8; x += 8 )
868 /* 1: Calculate threshold */
869 /* Calculate max/min for each block */
870 pp_dering_MinMax( p_block, i_stride,
871 &i_min, &i_max );
872 /* Calculate thr*/
873 i_thr = ( i_max + i_min + 1 )/2;
875 /* 2: Index acquisition 10x10 */
876 /* point on 10x10 in wich we have our 8x8 block */
877 pp_dering_BinIndex( p_block - i_stride -1, i_stride,
878 i_thr,
879 i_bin );
881 /* 3: adaptive smoothing */
882 pp_dering_Filter( p_block, i_stride,
883 i_bin,
884 p_QP_store[(y>>5)*i_QP_stride+ (x>>5)]);
885 p_block += 8;