r1009: Move the dependencies to newer package names
[cinelerra_cv/mob.git] / mpeg2enc / motion.c
blob8cee7bca387360302fbaa1c07eaeea5e0bc32b42
1 /* motion.c, motion estimation */
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
5 /*
6 * Disclaimer of Warranty
8 * These software programs are available to the user without any license fee or
9 * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
10 * any and all warranties, whether express, implied, or statuary, including any
11 * implied warranties or merchantability or of fitness for a particular
12 * purpose. In no event shall the copyright-holder be liable for any
13 * incidental, punitive, or consequential damages of any kind whatsoever
14 * arising from the use of these programs.
16 * This disclaimer of warranty extends to the user of these programs and user's
17 * customers, employees, agents, transferees, successors, and assigns.
19 * The MPEG Software Simulation Group does not represent or warrant that the
20 * programs furnished hereunder are free of infringement of any third-party
21 * patents.
23 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24 * are subject to royalty fees to patent holders. Many of these patents are
25 * general enough such that they are unavoidable regardless of implementation
26 * design.
30 #include <limits.h>
31 #include <stdio.h>
32 #include "config.h"
33 #include "global.h"
34 #include "cpu_accel.h"
35 #include "simd.h"
37 /* Macro-block Motion compensation results record */
39 typedef struct _blockcrd {
40 int16_t x;
41 int16_t y;
42 } blockxy;
44 struct mb_motion
46 blockxy pos; // Half-pel co-ordinates of source block
47 int sad; // Sum of absolute difference
48 int var;
49 uint8_t *blk ; // Source block data (in luminace data array)
50 int hx, hy; // Half-pel offsets
51 int fieldsel; // 0 = top 1 = bottom
52 int fieldoff; // Offset from start of frame data to first line
53 // of field. (top = 0, bottom = width );
56 typedef struct mb_motion mb_motion_s;
59 struct subsampled_mb
61 uint8_t *mb; // One pel
62 uint8_t *fmb; // Two-pel subsampled
63 uint8_t *qmb; // Four-pel subsampled
64 uint8_t *umb; // U compoenent one-pel
65 uint8_t *vmb; // V component one-pel
68 typedef struct subsampled_mb subsampled_mb_s;
71 static void frame_ME (motion_engine_t *engine,
72 pict_data_s *picture,
73 motion_comp_s *mc,
74 int mboffset,
75 int i, int j, struct mbinfo *mbi);
77 static void field_ME (motion_engine_t *engine,
78 pict_data_s *picture,
79 motion_comp_s *mc,
80 int mboffset,
81 int i, int j,
82 struct mbinfo *mbi,
83 int secondfield,
84 int ipflag);
86 static void frame_estimate (motion_engine_t *engine,
87 uint8_t *org,
88 uint8_t *ref,
89 subsampled_mb_s *ssmb,
90 int i, int j,
91 int sx, int sy,
92 mb_motion_s *bestfr,
93 mb_motion_s *besttop,
94 mb_motion_s *bestbot,
95 int imins[2][2], int jmins[2][2]);
97 static void field_estimate (motion_engine_t *engine,
98 pict_data_s *picture,
99 uint8_t *toporg,
100 uint8_t *topref,
101 uint8_t *botorg,
102 uint8_t *botref,
103 subsampled_mb_s *ssmb,
104 int i, int j, int sx, int sy, int ipflag,
105 mb_motion_s *bestfr,
106 mb_motion_s *best8u,
107 mb_motion_s *best8l,
108 mb_motion_s *bestsp);
110 static void dpframe_estimate (motion_engine_t *engine,
111 pict_data_s *picture,
112 uint8_t *ref,
113 subsampled_mb_s *ssmb,
114 int i, int j, int iminf[2][2], int jminf[2][2],
115 mb_motion_s *dpbest,
116 int *imindmvp, int *jmindmvp,
117 int *vmcp);
119 static void dpfield_estimate (motion_engine_t *engine,
120 pict_data_s *picture,
121 uint8_t *topref,
122 uint8_t *botref,
123 uint8_t *mb,
124 int i, int j,
125 int imins, int jmins,
126 mb_motion_s *dpbest,
127 int *vmcp);
129 static void fullsearch (motion_engine_t *engine,
130 uint8_t *org, uint8_t *ref,
131 subsampled_mb_s *ssblk,
132 int lx, int i0, int j0,
133 int sx, int sy, int h,
134 int xmax, int ymax,
135 mb_motion_s *motion );
137 static void find_best_one_pel( motion_engine_t *engine,
138 uint8_t *org, uint8_t *blk,
139 int searched_size,
140 int i0, int j0,
141 int ilow, int jlow,
142 int xmax, int ymax,
143 int lx, int h,
144 mb_motion_s *res
146 static int build_sub22_mcomps( motion_engine_t *engine,
147 int i0, int j0, int ihigh, int jhigh,
148 int null_mc_sad,
149 uint8_t *s22org, uint8_t *s22blk,
150 int flx, int fh, int searched_sub44_size );
152 #ifdef X86_CPU
153 static void find_best_one_pel_mmxe( motion_engine_t *engine,
154 uint8_t *org, uint8_t *blk,
155 int searched_size,
156 int i0, int j0,
157 int ilow, int jlow,
158 int xmax, int ymax,
159 int lx, int h,
160 mb_motion_s *res
163 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh,
164 int null_mc_sad,
165 uint8_t *s22org, uint8_t *s22blk,
166 int flx, int fh, int searched_sub44_size );
167 static int (*pmblock_sub44_dists)( uint8_t *blk, uint8_t *ref,
168 int ilow, int jlow,
169 int ihigh, int jhigh,
170 int h, int rowstride,
171 int threshold,
172 mc_result_s *resvec);
173 #endif
175 static int unidir_pred_var( const mb_motion_s *motion,
176 uint8_t *mb, int lx, int h);
177 static int bidir_pred_var( const mb_motion_s *motion_f,
178 const mb_motion_s *motion_b,
179 uint8_t *mb, int lx, int h);
180 static int bidir_pred_sad( const mb_motion_s *motion_f,
181 const mb_motion_s *motion_b,
182 uint8_t *mb, int lx, int h);
184 static int variance( uint8_t *mb, int size, int lx);
186 static int dist22 ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
188 static int dist44 ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
189 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
190 int lx, int h);
191 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b,
192 uint8_t *blk2,
193 int lx, int h);
196 static void (*pfind_best_one_pel)( motion_engine_t *engine,
197 uint8_t *org, uint8_t *blk,
198 int searched_size,
199 int i0, int j0,
200 int ilow, int jlow,
201 int xmax, int ymax,
202 int lx, int h,
203 mb_motion_s *res
205 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
206 int i0, int j0, int ihigh, int jhigh,
207 int null_mc_sad,
208 uint8_t *s22org, uint8_t *s22blk,
209 int flx, int fh, int searched_sub44_size );
211 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
212 int lx, int h);
213 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b,
214 uint8_t *blk2,
215 int lx, int h);
217 static int dist1_00( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
218 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
219 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
220 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
221 static int dist2 (uint8_t *blk1, uint8_t *blk2,
222 int lx, int hx, int hy, int h);
223 static int bdist2 (uint8_t *pf, uint8_t *pb,
224 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
225 static int bdist1 (uint8_t *pf, uint8_t *pb,
226 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
229 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
230 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
231 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
232 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
233 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
234 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
236 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
237 int lx, int hx, int hy, int h);
240 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
241 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
243 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
244 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
247 /* DEBUGGER...
248 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
250 rx *= 2; ry *= 2;
251 if( motion->sad < 0 || motion->sad > 0x10000 )
253 printf( "SAD ooops %s\n", str );
254 exit(1);
256 if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
258 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
259 exit(1);
261 if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
263 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
264 exit(1);
268 static void init_mc( mb_motion_s *motion )
270 motion->sad = -123;
271 motion->pos.x = -1000;
272 motion->pos.y = -1000;
276 /* unidir_NI_var_sum
277 Compute the combined variance of luminance and chrominance information
278 for a particular non-intra macro block after unidirectional
279 motion compensation...
281 Note: results are scaled to give chrominance equal weight to
282 chrominance. The variance of the luminance portion is computed
283 at the time the motion compensation is computed.
285 TODO: Perhaps we should compute the whole thing in fullsearch not
286 seperate it. However, that would involve a lot of fiddling with
287 field_* and until its thoroughly debugged and tested I think I'll
288 leave that alone. Furthermore, it is unclear if its really worth
289 doing these computations for B *and* P frames.
291 TODO: BUG: ONLY works for 420 video...
295 static int unidir_chrom_var_sum( mb_motion_s *lum_mc,
296 uint8_t **ref,
297 subsampled_mb_s *ssblk,
298 int lx, int h )
300 int uvlx = (lx>>1);
301 int uvh = (h>>1);
302 /* N.b. MC co-ordinates are computed in half-pel units! */
303 int cblkoffset = (lum_mc->fieldoff>>1) +
304 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
306 return ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
307 (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
311 bidir_NI_var_sum
312 Compute the combined variance of luminance and chrominance information
313 for a particular non-intra macro block after unidirectional
314 motion compensation...
316 Note: results are scaled to give chrominance equal weight to
317 chrominance. The variance of the luminance portion is computed
318 at the time the motion compensation is computed.
320 Note: results scaled to give chrominance equal weight to chrominance.
322 TODO: BUG: ONLY works for 420 video...
324 NOTE: Currently unused but may be required if it turns out that taking
325 chrominance into account in B frames is needed.
329 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f,
330 mb_motion_s *lum_mc_b,
331 uint8_t **ref_f,
332 uint8_t **ref_b,
333 subsampled_mb_s *ssblk,
334 int lx, int h )
336 int uvlx = (lx>>1);
337 int uvh = (h>>1);
338 /* N.b. MC co-ordinates are computed in half-pel units! */
339 int cblkoffset_f = (lum_mc_f->fieldoff>>1) +
340 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
341 int cblkoffset_b = (lum_mc_b->fieldoff>>1) +
342 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
344 return (
345 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
346 ssblk->umb, uvlx, uvh ) +
347 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
348 ssblk->vmb, uvlx, uvh ))*2;
352 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
354 return (variance(ssblk->umb,(h>>1),(lx>>1)) +
355 variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
359 * frame picture motion estimation
361 * org: top left pel of source reference frame
362 * ref: top left pel of reconstructed reference frame
363 * ssmb: macroblock to be matched
364 * i,j: location of mb relative to ref (=center of search window)
365 * sx,sy: half widths of search window
366 * besfr: location and SAD of best frame prediction
367 * besttop: location of best field pred. for top field of mb
368 * bestbo : location of best field pred. for bottom field of mb
371 static void frame_estimate(motion_engine_t *engine,
372 uint8_t *org,
373 uint8_t *ref,
374 subsampled_mb_s *ssmb,
375 int i, int j, int sx, int sy,
376 mb_motion_s *bestfr,
377 mb_motion_s *besttop,
378 mb_motion_s *bestbot,
379 int imins[2][2],
380 int jmins[2][2]
383 subsampled_mb_s botssmb;
384 mb_motion_s topfld_mc;
385 mb_motion_s botfld_mc;
387 botssmb.mb = ssmb->mb+width;
388 botssmb.fmb = ssmb->mb+(width>>1);
389 botssmb.qmb = ssmb->qmb+(width>>2);
390 botssmb.umb = ssmb->umb+(width>>1);
391 botssmb.vmb = ssmb->vmb+(width>>1);
393 /* frame prediction */
394 fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
395 bestfr );
396 bestfr->fieldsel = 0;
397 bestfr->fieldoff = 0;
399 /* predict top field from top field */
400 fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
401 &topfld_mc);
403 /* predict top field from bottom field */
404 fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
405 width,height>>1, &botfld_mc);
407 /* set correct field selectors... */
408 topfld_mc.fieldsel = 0;
409 botfld_mc.fieldsel = 1;
410 topfld_mc.fieldoff = 0;
411 botfld_mc.fieldoff = width;
413 imins[0][0] = topfld_mc.pos.x;
414 jmins[0][0] = topfld_mc.pos.y;
415 imins[1][0] = botfld_mc.pos.x;
416 jmins[1][0] = botfld_mc.pos.y;
418 /* select prediction for top field */
419 if (topfld_mc.sad<=botfld_mc.sad)
421 *besttop = topfld_mc;
423 else
425 *besttop = botfld_mc;
428 /* predict bottom field from top field */
429 fullsearch(engine, org,ref,&botssmb,
430 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
431 &topfld_mc);
433 /* predict bottom field from bottom field */
434 fullsearch(engine, org+width,ref+width,&botssmb,
435 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
436 &botfld_mc);
438 /* set correct field selectors... */
439 topfld_mc.fieldsel = 0;
440 botfld_mc.fieldsel = 1;
441 topfld_mc.fieldoff = 0;
442 botfld_mc.fieldoff = width;
444 imins[0][1] = topfld_mc.pos.x;
445 jmins[0][1] = topfld_mc.pos.y;
446 imins[1][1] = botfld_mc.pos.x;
447 jmins[1][1] = botfld_mc.pos.y;
449 /* select prediction for bottom field */
450 if (botfld_mc.sad<=topfld_mc.sad)
452 *bestbot = botfld_mc;
454 else
456 *bestbot = topfld_mc;
461 * field picture motion estimation subroutine
463 * toporg: address of original top reference field
464 * topref: address of reconstructed top reference field
465 * botorg: address of original bottom reference field
466 * botref: address of reconstructed bottom reference field
467 * ssmmb: macroblock to be matched
468 * i,j: location of mb (=center of search window)
469 * sx,sy: half width/height of search window
471 * bestfld: location and distance of best field prediction
472 * best8u: location of best 16x8 pred. for upper half of mb
473 * best8lp: location of best 16x8 pred. for lower half of mb
474 * bdestsp: location and distance of best same parity field
475 * prediction (needed for dual prime, only valid if
476 * ipflag==0)
479 static void field_estimate (motion_engine_t *engine,
480 pict_data_s *picture,
481 uint8_t *toporg,
482 uint8_t *topref,
483 uint8_t *botorg,
484 uint8_t *botref,
485 subsampled_mb_s *ssmb,
486 int i, int j, int sx, int sy, int ipflag,
487 mb_motion_s *bestfld,
488 mb_motion_s *best8u,
489 mb_motion_s *best8l,
490 mb_motion_s *bestsp)
493 mb_motion_s topfld_mc;
494 mb_motion_s botfld_mc;
495 int dt, db;
496 int notop, nobot;
497 subsampled_mb_s botssmb;
499 botssmb.mb = ssmb->mb+width;
500 botssmb.umb = ssmb->umb+(width>>1);
501 botssmb.vmb = ssmb->vmb+(width>>1);
502 botssmb.fmb = ssmb->fmb+(width>>1);
503 botssmb.qmb = ssmb->qmb+(width>>2);
505 /* if ipflag is set, predict from field of opposite parity only */
506 notop = ipflag && (picture->pict_struct==TOP_FIELD);
507 nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
509 /* field prediction */
511 /* predict current field from top field */
512 if (notop)
513 topfld_mc.sad = dt = 65536; /* infinity */
514 else
515 fullsearch(engine, toporg,topref,ssmb,width<<1,
516 i,j,sx,sy>>1,16,width,height>>1,
517 &topfld_mc);
518 dt = topfld_mc.sad;
519 /* predict current field from bottom field */
520 if (nobot)
521 botfld_mc.sad = db = 65536; /* infinity */
522 else
523 fullsearch(engine, botorg,botref,ssmb,width<<1,
524 i,j,sx,sy>>1,16,width,height>>1,
525 &botfld_mc);
526 db = botfld_mc.sad;
527 /* Set correct field selectors */
528 topfld_mc.fieldsel = 0;
529 botfld_mc.fieldsel = 1;
530 topfld_mc.fieldoff = 0;
531 botfld_mc.fieldoff = width;
533 /* same parity prediction (only valid if ipflag==0) */
534 if (picture->pict_struct==TOP_FIELD)
536 *bestsp = topfld_mc;
538 else
540 *bestsp = botfld_mc;
543 /* select field prediction */
544 if (dt<=db)
546 *bestfld = topfld_mc;
548 else
550 *bestfld = botfld_mc;
554 /* 16x8 motion compensation */
556 /* predict upper half field from top field */
557 if (notop)
558 topfld_mc.sad = dt = 65536;
559 else
560 fullsearch(engine, toporg,topref,ssmb,width<<1,
561 i,j,sx,sy>>1,8,width,height>>1,
562 &topfld_mc);
563 dt = topfld_mc.sad;
564 /* predict upper half field from bottom field */
565 if (nobot)
566 botfld_mc.sad = db = 65536;
567 else
568 fullsearch(engine, botorg,botref,ssmb,width<<1,
569 i,j,sx,sy>>1,8,width,height>>1,
570 &botfld_mc);
571 db = botfld_mc.sad;
573 /* Set correct field selectors */
574 topfld_mc.fieldsel = 0;
575 botfld_mc.fieldsel = 1;
576 topfld_mc.fieldoff = 0;
577 botfld_mc.fieldoff = width;
579 /* select prediction for upper half field */
580 if (dt<=db)
582 *best8u = topfld_mc;
584 else
586 *best8u = botfld_mc;
589 /* predict lower half field from top field */
591 N.b. For interlaced data width<<4 (width*16) takes us 8 rows
592 down in the same field.
593 Thus for the fast motion data (2*2
594 sub-sampled) we need to go 4 rows down in the same field.
595 This requires adding width*4 = (width<<2).
596 For the 4*4 sub-sampled motion data we need to go down 2 rows.
597 This requires adding width = width
600 if (notop)
601 topfld_mc.sad = dt = 65536;
602 else
603 fullsearch(engine, toporg,topref,&botssmb,
604 width<<1,
605 i,j+8,sx,sy>>1,8,width,height>>1,
606 /* &imint,&jmint, &dt,*/ &topfld_mc);
607 dt = topfld_mc.sad;
608 /* predict lower half field from bottom field */
609 if (nobot)
610 botfld_mc.sad = db = 65536;
611 else
612 fullsearch(engine, botorg,botref,&botssmb,width<<1,
613 i,j+8,sx,sy>>1,8,width,height>>1,
614 /* &iminb,&jminb, &db,*/ &botfld_mc);
615 db = botfld_mc.sad;
616 /* Set correct field selectors */
617 topfld_mc.fieldsel = 0;
618 botfld_mc.fieldsel = 1;
619 topfld_mc.fieldoff = 0;
620 botfld_mc.fieldoff = width;
622 /* select prediction for lower half field */
623 if (dt<=db)
625 *best8l = topfld_mc;
627 else
629 *best8l = botfld_mc;
633 static void dpframe_estimate (motion_engine_t *engine,
634 pict_data_s *picture,
635 uint8_t *ref,
636 subsampled_mb_s *ssmb,
638 int i, int j, int iminf[2][2], int jminf[2][2],
639 mb_motion_s *best_mc,
640 int *imindmvp, int *jmindmvp,
641 int *vmcp)
643 int pref,ppred,delta_x,delta_y;
644 int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
645 int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
646 int vmc,local_dist;
648 /* Calculate Dual Prime distortions for 9 delta candidates
649 * for each of the four minimum field vectors
650 * Note: only for P pictures!
653 /* initialize minimum dual prime distortion to large value */
654 vmc = INT_MAX;
656 for (pref=0; pref<2; pref++)
658 for (ppred=0; ppred<2; ppred++)
660 /* convert Cartesian absolute to relative motion vector
661 * values (wrt current macroblock address (i,j)
663 is = iminf[pref][ppred] - (i<<1);
664 js = jminf[pref][ppred] - (j<<1);
666 if (pref!=ppred)
668 /* vertical field shift adjustment */
669 if (ppred==0)
670 js++;
671 else
672 js--;
674 /* mvxs and mvys scaling*/
675 is<<=1;
676 js<<=1;
677 if (picture->topfirst == ppred)
679 /* second field: scale by 1/3 */
680 is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
681 js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
683 else
684 continue;
687 /* vector for prediction from field of opposite 'parity' */
688 if (picture->topfirst)
690 /* vector for prediction of top field from bottom field */
691 it0 = ((is+(is>0))>>1);
692 jt0 = ((js+(js>0))>>1) - 1;
694 /* vector for prediction of bottom field from top field */
695 ib0 = ((3*is+(is>0))>>1);
696 jb0 = ((3*js+(js>0))>>1) + 1;
698 else
700 /* vector for prediction of top field from bottom field */
701 it0 = ((3*is+(is>0))>>1);
702 jt0 = ((3*js+(js>0))>>1) - 1;
704 /* vector for prediction of bottom field from top field */
705 ib0 = ((is+(is>0))>>1);
706 jb0 = ((js+(js>0))>>1) + 1;
709 /* convert back to absolute half-pel field picture coordinates */
710 is += i<<1;
711 js += j<<1;
712 it0 += i<<1;
713 jt0 += j<<1;
714 ib0 += i<<1;
715 jb0 += j<<1;
717 if (is >= 0 && is <= (width-16)<<1 &&
718 js >= 0 && js <= (height-16))
720 for (delta_y=-1; delta_y<=1; delta_y++)
722 for (delta_x=-1; delta_x<=1; delta_x++)
724 /* opposite field coordinates */
725 it = it0 + delta_x;
726 jt = jt0 + delta_y;
727 ib = ib0 + delta_x;
728 jb = jb0 + delta_y;
730 if (it >= 0 && it <= (width-16)<<1 &&
731 jt >= 0 && jt <= (height-16) &&
732 ib >= 0 && ib <= (width-16)<<1 &&
733 jb >= 0 && jb <= (height-16))
735 /* compute prediction error */
736 local_dist = (*pbdist2)(
737 ref + (is>>1) + (width<<1)*(js>>1),
738 ref + width + (it>>1) + (width<<1)*(jt>>1),
739 ssmb->mb, /* current mb location */
740 width<<1, /* adjacent line distance */
741 is&1, js&1, it&1, jt&1, /* half-pel flags */
742 8); /* block height */
743 local_dist += (*pbdist2)(
744 ref + width + (is>>1) + (width<<1)*(js>>1),
745 ref + (ib>>1) + (width<<1)*(jb>>1),
746 ssmb->mb + width, /* current mb location */
747 width<<1, /* adjacent line distance */
748 is&1, js&1, ib&1, jb&1, /* half-pel flags */
749 8); /* block height */
751 /* update delta with least distortion vector */
752 if (local_dist < vmc)
754 imins = is;
755 jmins = js;
756 imint = it;
757 jmint = jt;
758 iminb = ib;
759 jminb = jb;
760 imindmv = delta_x;
761 jmindmv = delta_y;
762 vmc = local_dist;
765 } /* end delta x loop */
766 } /* end delta y loop */
771 /* TODO: This is now likely to be obsolete... */
772 /* Compute L1 error for decision purposes */
773 local_dist = (*pbdist1)(
774 ref + (imins>>1) + (width<<1)*(jmins>>1),
775 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
776 ssmb->mb,
777 width<<1,
778 imins&1, jmins&1, imint&1, jmint&1,
780 //printf("motion 1 %p\n", pbdist1);
781 local_dist += (*pbdist1)(
782 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
783 ref + (iminb>>1) + (width<<1)*(jminb>>1),
784 ssmb->mb + width,
785 width<<1,
786 imins&1, jmins&1, iminb&1, jminb&1,
788 //printf("motion 2\n");
790 best_mc->sad = local_dist;
791 best_mc->pos.x = imins;
792 best_mc->pos.y = jmins;
793 *vmcp = vmc;
794 *imindmvp = imindmv;
795 *jmindmvp = jmindmv;
796 //printf("motion 2\n");
799 static void dpfield_estimate(motion_engine_t *engine,
800 pict_data_s *picture,
801 uint8_t *topref,
802 uint8_t *botref,
803 uint8_t *mb,
804 int i, int j, int imins, int jmins,
805 mb_motion_s *bestdp_mc,
806 int *vmcp
810 uint8_t *sameref, *oppref;
811 int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
812 int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
814 /* Calculate Dual Prime distortions for 9 delta candidates */
815 /* Note: only for P pictures! */
817 /* Assign opposite and same reference pointer */
818 if (picture->pict_struct==TOP_FIELD)
820 sameref = topref;
821 oppref = botref;
823 else
825 sameref = botref;
826 oppref = topref;
829 /* convert Cartesian absolute to relative motion vector
830 * values (wrt current macroblock address (i,j)
832 mvxs = imins - (i<<1);
833 mvys = jmins - (j<<1);
835 /* vector for prediction from field of opposite 'parity' */
836 mvxo0 = (mvxs+(mvxs>0)) >> 1; /* mvxs / / */
837 mvyo0 = (mvys+(mvys>0)) >> 1; /* mvys / / 2*/
839 /* vertical field shift correction */
840 if (picture->pict_struct==TOP_FIELD)
841 mvyo0--;
842 else
843 mvyo0++;
845 /* convert back to absolute coordinates */
846 io0 = mvxo0 + (i<<1);
847 jo0 = mvyo0 + (j<<1);
849 /* initialize minimum dual prime distortion to large value */
850 vmc_dp = 1 << 30;
852 for (delta_y = -1; delta_y <= 1; delta_y++)
854 for (delta_x = -1; delta_x <=1; delta_x++)
856 /* opposite field coordinates */
857 io = io0 + delta_x;
858 jo = jo0 + delta_y;
860 if (io >= 0 && io <= (width-16)<<1 &&
861 jo >= 0 && jo <= (height2-16)<<1)
863 /* compute prediction error */
864 local_dist = (*pbdist2)(
865 sameref + (imins>>1) + width2*(jmins>>1),
866 oppref + (io>>1) + width2*(jo>>1),
867 mb, /* current mb location */
868 width2, /* adjacent line distance */
869 imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
870 16); /* block height */
872 /* update delta with least distortion vector */
873 if (local_dist < vmc_dp)
875 imino = io;
876 jmino = jo;
877 imindmv = delta_x;
878 jmindmv = delta_y;
879 vmc_dp = local_dist;
882 } /* end delta x loop */
883 } /* end delta y loop */
885 /* Compute L1 error for decision purposes */
886 bestdp_mc->sad =
887 (*pbdist1)(
888 sameref + (imins>>1) + width2*(jmins>>1),
889 oppref + (imino>>1) + width2*(jmino>>1),
890 mb, /* current mb location */
891 width2, /* adjacent line distance */
892 imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
893 16); /* block height */
895 bestdp_mc->pos.x = imindmv;
896 bestdp_mc->pos.x = imindmv;
897 *vmcp = vmc_dp;
902 * Vectors of motion compensations
906 Take a vector of motion compensations and repeatedly make passes
907 discarding all elements whose sad "weight" is above the current mean weight.
910 static void sub_mean_reduction( mc_result_s *matches, int len,
911 int times,
912 int *newlen_res, int *minweight_res)
914 int i,j;
915 int weight_sum;
916 int mean_weight;
917 int min_weight = 100000;
918 if( len == 0 )
920 *minweight_res = 100000;
921 *newlen_res = 0;
922 return;
925 for(;;)
927 weight_sum = 0;
928 for( i = 0; i < len ; ++i )
929 weight_sum += matches[i].weight;
930 mean_weight = weight_sum / len;
932 if( times <= 0)
933 break;
935 j = 0;
936 for( i =0; i < len; ++i )
938 if( matches[i].weight <= mean_weight )
940 if( times == 1)
942 min_weight = matches[i].weight ;
944 matches[j] = matches[i];
945 ++j;
948 len = j;
949 --times;
951 *newlen_res = len;
952 *minweight_res = mean_weight;
956 Build a vector of the top 4*4 sub-sampled motion compensations in the box
957 (ilow,jlow) to (ihigh,jhigh).
959 The algorithm is as follows:
960 1. coarse matches on an 8*8 grid of positions are collected that fall below
961 a (very conservative) sad threshold (basically 50% more than moving average of
962 the mean sad of such matches).
964 2. The worse than-average matches are discarded.
966 3. The remaining coarse matches are expanded with the left/lower neighbouring
967 4*4 grid matches. Again only those better than a threshold (this time the mean
968 of the 8*8 grid matches are retained.
970 4. Multiple passes are made discarding worse than-average matches. The number of
971 passes is specified by the user. The default it 2 (leaving roughly 1/4 of the matches).
973 The net result is very fast and find good matches if they're to be found. I.e. the
974 penalty over exhaustive search is pretty low.
976 NOTE: The "discard below average" trick depends critically on having some variation in
977 the matches. The slight penalty imposed for distant matches (reasonably since the
978 motion vectors have to be encoded) is *vital* as otherwise pathologically bad
979 performance results on highly uniform images.
981 TODO: We should probably allow the user to eliminate the initial thinning of 8*8
982 grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
987 static int build_sub44_mcomps(motion_engine_t *engine,
988 int ilow, int jlow, int ihigh, int jhigh,
989 int i0, int j0,
990 int null_mc_sad,
991 uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
993 uint8_t *s44orgblk;
994 int istrt = ilow-i0;
995 int jstrt = jlow-j0;
996 int iend = ihigh-i0;
997 int jend = jhigh-j0;
998 int mean_weight;
999 int threshold;
1001 #ifdef X86_CPU
1003 /*int rangex, rangey;
1004 static int rough_num_mcomps;
1005 static mc_result_s rough_mcomps[MAX_44_MATCHES];
1006 int k;
1008 #else
1009 int i,j;
1010 int s1;
1011 uint8_t *old_s44orgblk;
1012 #endif
1013 /* N.b. we may ignore the right hand block of the pair going over the
1014 right edge as we have carefully allocated the buffer oversize to ensure
1015 no memory faults. The later motion compensation calculations
1016 performed on the results of this pass will filter out
1017 out-of-range blocks...
1021 engine->sub44_num_mcomps = 0;
1023 threshold = 6*null_mc_sad / (4*4*mc_44_red);
1024 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1026 /* Exhaustive search on 4*4 sub-sampled data. This is affordable because
1027 (a) it is only 16th of the size of the real 1-pel data
1028 (b) we ignore those matches with an sad above our threshold.
1030 #ifndef X86_CPU
1032 /* Invariant: s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1033 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1034 for( j = jstrt; j <= jend; j += 4 )
1036 old_s44orgblk = s44orgblk;
1037 for( i = istrt; i <= iend; i += 4 )
1039 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1040 if( s1 < threshold )
1042 engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1043 engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1044 engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1045 ++engine->sub44_num_mcomps;
1047 s44orgblk += 1;
1049 s44orgblk = old_s44orgblk + qlx;
1052 #else
1054 engine->sub44_num_mcomps
1055 = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1056 istrt, jstrt,
1057 iend, jend,
1058 qh, qlx,
1059 threshold,
1060 engine->sub44_mcomps);
1062 #endif
1063 /* If we're really pushing quality we reduce once otherwise twice. */
1065 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1066 &engine->sub44_num_mcomps, &mean_weight);
1069 return engine->sub44_num_mcomps;
1073 /* Build a vector of the best 2*2 sub-sampled motion
1074 compensations using the best 4*4 matches as starting points. As
1075 with with the 4*4 matches We don't collect them densely as they're
1076 just search starting points for 1-pel search and ones that are 1 out
1077 should still give better than average matches...
1082 static int build_sub22_mcomps(motion_engine_t *engine,
1083 int i0, int j0, int ihigh, int jhigh,
1084 int null_mc_sad,
1085 uint8_t *s22org, uint8_t *s22blk,
1086 int flx, int fh, int searched_sub44_size )
1088 int i,k,s;
1089 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1091 int min_weight;
1092 int ilim = ihigh-i0;
1093 int jlim = jhigh-j0;
1094 blockxy matchrec;
1095 uint8_t *s22orgblk;
1097 engine->sub22_num_mcomps = 0;
1098 for( k = 0; k < searched_sub44_size; ++k )
1101 matchrec.x = engine->sub44_mcomps[k].x;
1102 matchrec.y = engine->sub44_mcomps[k].y;
1104 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1105 for( i = 0; i < 4; ++i )
1107 if( matchrec.x <= ilim && matchrec.y <= jlim )
1109 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1110 if( s < threshold )
1112 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1113 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1114 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1115 ++engine->sub22_num_mcomps;
1119 if( i == 1 )
1121 s22orgblk += flx-1;
1122 matchrec.x -= 2;
1123 matchrec.y += 2;
1125 else
1127 s22orgblk += 1;
1128 matchrec.x += 2;
1136 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1137 &engine->sub22_num_mcomps, &min_weight );
1138 return engine->sub22_num_mcomps;
1141 #ifdef X86_CPU
1142 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1143 int i0, int j0, int ihigh, int jhigh,
1144 int null_mc_sad,
1145 uint8_t *s22org, uint8_t *s22blk,
1146 int flx, int fh, int searched_sub44_size )
1148 int i,k,s;
1149 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1151 int min_weight;
1152 int ilim = ihigh-i0;
1153 int jlim = jhigh-j0;
1154 blockxy matchrec;
1155 uint8_t *s22orgblk;
1156 int resvec[4];
1158 /* TODO: The calculation of the lstrow offset really belongs in
1159 asm code... */
1160 int lstrow=(fh-1)*flx;
1162 engine->sub22_num_mcomps = 0;
1163 for( k = 0; k < searched_sub44_size; ++k )
1166 matchrec.x = engine->sub44_mcomps[k].x;
1167 matchrec.y = engine->sub44_mcomps[k].y;
1169 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1170 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1171 for( i = 0; i < 4; ++i )
1173 if( matchrec.x <= ilim && matchrec.y <= jlim )
1175 s =resvec[i];
1176 if( s < threshold )
1178 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1179 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1180 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1181 ++engine->sub22_num_mcomps;
1185 if( i == 1 )
1187 matchrec.x -= 2;
1188 matchrec.y += 2;
1190 else
1192 matchrec.x += 2;
1199 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1200 &engine->sub22_num_mcomps, &min_weight );
1201 return engine->sub22_num_mcomps;
1204 #endif
1207 Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1208 TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1209 eventually get turned into relative ones anyway...
1213 static void find_best_one_pel(motion_engine_t *engine,
1214 uint8_t *org, uint8_t *blk,
1215 int searched_size,
1216 int i0, int j0,
1217 int ilow, int jlow,
1218 int xmax, int ymax,
1219 int lx, int h,
1220 mb_motion_s *res
1224 int i,k;
1225 int d;
1226 blockxy minpos = res->pos;
1227 int dmin = INT_MAX;
1228 uint8_t *orgblk;
1229 int penalty;
1230 int init_search;
1231 int init_size;
1232 blockxy matchrec;
1234 init_search = searched_size;
1235 init_size = engine->sub22_num_mcomps;
1236 for( k = 0; k < searched_size; ++k )
1238 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1239 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1240 orgblk = org + matchrec.x+lx*matchrec.y;
1241 penalty = abs(matchrec.x)+abs(matchrec.y);
1243 for( i = 0; i < 4; ++i )
1245 if( matchrec.x <= xmax && matchrec.y <= ymax )
1248 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1249 if (d<dmin)
1251 dmin = d;
1252 minpos = matchrec;
1255 if( i == 1 )
1257 orgblk += lx-1;
1258 matchrec.x -= 1;
1259 matchrec.y += 1;
1261 else
1263 orgblk += 1;
1264 matchrec.x += 1;
1269 res->pos = minpos;
1270 res->blk = org + minpos.x+lx*minpos.y;
1271 res->sad = dmin;
1275 #ifdef X86_CPU
1276 void find_best_one_pel_mmxe(motion_engine_t *engine,
1277 uint8_t *org, uint8_t *blk,
1278 int searched_size,
1279 int i0, int j0,
1280 int ilow, int jlow,
1281 int xmax, int ymax,
1282 int lx, int h,
1283 mb_motion_s *res
1287 int i,k;
1288 int d;
1289 blockxy minpos = res->pos;
1290 int dmin = INT_MAX;
1291 uint8_t *orgblk;
1292 int penalty;
1293 int init_search;
1294 int init_size;
1295 blockxy matchrec;
1296 int resvec[4];
1299 init_search = searched_size;
1300 init_size = engine->sub22_num_mcomps;
1301 for( k = 0; k < searched_size; ++k )
1303 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1304 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1305 orgblk = org + matchrec.x+lx*matchrec.y;
1306 penalty = abs(matchrec.x)+abs(matchrec.y);
1309 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1311 for( i = 0; i < 4; ++i )
1313 if( matchrec.x <= xmax && matchrec.y <= ymax )
1316 d = penalty+resvec[i];
1317 if (d<dmin)
1319 dmin = d;
1320 minpos = matchrec;
1323 if( i == 1 )
1325 orgblk += lx-1;
1326 matchrec.x -= 1;
1327 matchrec.y += 1;
1329 else
1331 orgblk += 1;
1332 matchrec.x += 1;
1337 res->pos = minpos;
1338 res->blk = org + minpos.x+lx*minpos.y;
1339 res->sad = dmin;
1342 #endif
1345 * full search block matching
1347 * A.Stevens 2000: This is now a big misnomer. The search is now a hierarchical/sub-sampling
1348 * search not a full search. However, experiments have shown it is always close to
1349 * optimal and almost always very close or optimal.
1351 * blk: top left pel of (16*h) block
1352 * s22blk: top element of fast motion compensation block corresponding to blk
1353 * h: height of block
1354 * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1355 * org: top left pel of source reference picture
1356 * ref: top left pel of reconstructed reference picture
1357 * i0,j0: center of search window
1358 * sx,sy: half widths of search window
1359 * xmax,ymax: right/bottom limits of search area
1360 * iminp,jminp: pointers to where the result is stored
1361 * result is given as half pel offset from ref(0,0)
1362 * i.e. NOT relative to (i0,j0)
1366 static void fullsearch(motion_engine_t *engine,
1367 uint8_t *org,
1368 uint8_t *ref,
1369 subsampled_mb_s *ssblk,
1370 int lx, int i0, int j0,
1371 int sx, int sy, int h,
1372 int xmax, int ymax,
1373 /* int *iminp, int *jminp, int *sadminp, */
1374 mb_motion_s *res
1377 mb_motion_s best;
1378 /* int imin, jmin, dmin */
1379 int i,j,ilow,ihigh,jlow,jhigh;
1380 int d;
1382 /* NOTE: Surprisingly, the initial motion compensation search
1383 works better when the original image not the reference (reconstructed)
1384 image is used.
1386 uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1387 uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1388 uint8_t *orgblk;
1389 int flx = lx >> 1;
1390 int qlx = lx >> 2;
1391 int fh = h >> 1;
1392 int qh = h >> 2;
1395 /* xmax and ymax into more useful form... */
1396 xmax -= 16;
1397 ymax -= h;
1400 /* The search radii are *always* multiples of 4 to avoid messiness
1401 in the initial 4*4 pel search. This is handled by the
1402 parameter checking/processing code in readparmfile() */
1404 /* Create a distance-order mcomps of possible motion compensations
1405 based on the fast estimation data - 4*4 pel sums (4*4
1406 sub-sampled) rather than actual pel's. 1/16 the size... */
1407 jlow = j0-sy;
1408 jlow = jlow < 0 ? 0 : jlow;
1409 jhigh = j0+(sy-1);
1410 jhigh = jhigh > ymax ? ymax : jhigh;
1411 ilow = i0-sx;
1412 ilow = ilow < 0 ? 0 : ilow;
1413 ihigh = i0+(sx-1);
1414 ihigh = ihigh > xmax ? xmax : ihigh;
1417 Very rarely this may fail to find matchs due to all the good
1418 looking ones being over threshold. hence we make sure we
1419 fall back to a 0 motion compensation in this case.
1421 The sad for the 0 motion compensation is also very useful as
1422 a basis for setting thresholds for rejecting really dud 4*4
1423 and 2*2 sub-sampled matches.
1425 best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1426 ssblk->mb,
1429 best.sad);
1430 best.pos.x = i0;
1431 best.pos.y = j0;
1433 engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1434 ilow, jlow, ihigh, jhigh,
1435 i0, j0,
1436 best.sad,
1437 s44org,
1438 ssblk->qmb, qlx, qh );
1441 /* Now create a distance-ordered mcomps of possible motion
1442 compensations based on the fast estimation data - 2*2 pel sums
1443 using the best fraction of the 4*4 estimates However we cover
1444 only coarsely... on 4-pel boundaries... */
1446 engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh, jhigh,
1447 best.sad,
1448 s22org, ssblk->fmb, flx, fh,
1449 engine->sub44_num_mcomps );
1452 /* Now choose best 1-pel match from what approximates (not exact
1453 due to the pre-processing trick with the mean) the top 1/2 of
1454 the 2*2 matches
1458 (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1459 i0, j0,
1460 ilow, jlow, xmax, ymax,
1461 lx, h, &best );
1463 /* Final polish: half-pel search of best candidate against
1464 reconstructed image.
1467 best.pos.x <<= 1;
1468 best.pos.y <<= 1;
1469 best.hx = 0;
1470 best.hy = 0;
1472 ilow = best.pos.x - (best.pos.x>(ilow<<1));
1473 ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1474 jlow = best.pos.y - (best.pos.y>(jlow<<1));
1475 jhigh = best.pos.y+ (best.pos.y<((jhigh)<<1));
1477 for (j=jlow; j<=jhigh; j++)
1479 for (i=ilow; i<=ihigh; i++)
1481 orgblk = ref+(i>>1)+((j>>1)*lx);
1482 if( i&1 )
1484 if( j & 1 )
1485 d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1486 else
1487 d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1489 else
1491 if( j & 1 )
1492 d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1493 else
1494 d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1496 if (d<best.sad)
1498 best.sad = d;
1499 best.pos.x = i;
1500 best.pos.y = j;
1501 best.blk = orgblk;
1502 best.hx = i&1;
1503 best.hy = j&1;
1507 best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1508 *res = best;
1512 * total absolute difference between two (16*h) blocks
1513 * including optional half pel interpolation of blk1 (hx,hy)
1514 * blk1,blk2: addresses of top left pels of both blocks
1515 * lx: distance (in bytes) of vertically adjacent pels
1516 * hx,hy: flags for horizontal and/or vertical interpolation
1517 * h: height of block (usually 8 or 16)
1518 * distlim: bail out if sum exceeds this value
1521 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1522 costly. Really it sucks that C doesn't define a stdlib abs that could
1523 be realised as a compiler intrinsic using appropriate CPU instructions.
1524 That 1970's heritage...
1528 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1529 int lx, int h,int distlim)
1531 uint8_t *p1,*p2;
1532 int j;
1533 int s;
1534 register int v;
1536 s = 0;
1537 p1 = blk1;
1538 p2 = blk2;
1539 for (j=0; j<h; j++)
1542 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1543 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1544 pipestep(4); pipestep(5); pipestep(6); pipestep(7);
1545 pipestep(8); pipestep(9); pipestep(10); pipestep(11);
1546 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1547 #undef pipestep
1549 if (s >= distlim)
1550 break;
1552 p1+= lx;
1553 p2+= lx;
1555 return s;
1558 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1559 int lx, int h)
1561 uint8_t *p1,*p2;
1562 int i,j;
1563 int s;
1564 register int v;
1566 s = 0;
1567 p1 = blk1;
1568 p2 = blk2;
1569 for (j=0; j<h; j++)
1571 for (i=0; i<16; i++)
1574 v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1576 v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1578 s+=abs(v);
1580 p1+= lx;
1581 p2+= lx;
1583 return s;
1586 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1587 int lx, int h)
1589 uint8_t *p1,*p1a,*p2;
1590 int i,j;
1591 int s;
1592 register int v;
1594 s = 0;
1595 p1 = blk1;
1596 p2 = blk2;
1597 p1a = p1 + lx;
1598 for (j=0; j<h; j++)
1600 for (i=0; i<16; i++)
1602 v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1603 s+=abs(v);
1605 p1 = p1a;
1606 p1a+= lx;
1607 p2+= lx;
1610 return s;
1613 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1614 int lx, int h)
1616 uint8_t *p1,*p1a,*p2;
1617 int i,j;
1618 int s;
1619 register int v;
1621 s = 0;
1622 p1 = blk1;
1623 p2 = blk2;
1624 p1a = p1 + lx;
1626 for (j=0; j<h; j++)
1628 for (i=0; i<16; i++)
1630 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1631 s+=abs(v);
1633 p1 = p1a;
1634 p1a+= lx;
1635 p2+= lx;
1637 return s;
1640 /* USED only during debugging...
1641 void check_fast_motion_data(uint8_t *blk, char *label )
1643 uint8_t *b, *nb;
1644 uint8_t *pb,*p;
1645 uint8_t *qb;
1646 uint8_t *start_s22blk, *start_s44blk;
1647 int i;
1648 unsigned int mismatch;
1649 int nextfieldline;
1650 start_s22blk = (blk+height*width);
1651 start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1653 if (pict_struct==FRAME_PICTURE)
1655 nextfieldline = width;
1657 else
1659 nextfieldline = 2*width;
1662 mismatch = 0;
1663 pb = start_s22blk;
1664 qb = start_s44blk;
1665 p = blk;
1666 while( pb < qb )
1668 for( i = 0; i < nextfieldline/2; ++i )
1670 mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1671 p += 2;
1672 ++pb;
1674 p += nextfieldline;
1676 if( mismatch )
1678 printf( "Mismatch detected check %s for buffer %08x\n", label, blk );
1679 exit(1);
1685 Append fast motion estimation data to original luminance
1686 data. N.b. memory allocation for luminance data allows space
1687 for this information...
1690 void fast_motion_data(uint8_t *blk, int picture_struct )
1692 uint8_t *b, *nb;
1693 uint8_t *pb;
1694 uint8_t *qb;
1695 uint8_t *start_s22blk, *start_s44blk;
1696 uint16_t *start_rowblk, *start_colblk;
1697 int i;
1698 int nextfieldline;
1699 #ifdef TEST_RCSEARCH
1700 uint16_t *pc, *pr,*p;
1701 int rowsum;
1702 int j,s;
1703 int down16 = width*16;
1704 uint16_t sums[32];
1705 uint16_t rowsums[2048];
1706 uint16_t colsums[2048]; /* TODO: BUG: should resize with width */
1707 #endif
1710 /* In an interlaced field the "next" line is 2 width's down
1711 rather than 1 width down */
1713 if (picture_struct==FRAME_PICTURE)
1715 nextfieldline = width;
1717 else
1719 nextfieldline = 2*width;
1722 start_s22blk = blk+fsubsample_offset;
1723 start_s44blk = blk+qsubsample_offset;
1724 start_rowblk = (uint16_t *)blk+rowsums_offset;
1725 start_colblk = (uint16_t *)blk+colsums_offset;
1726 b = blk;
1727 nb = (blk+nextfieldline);
1728 /* Sneaky stuff here... we can do lines in both fields at once */
1729 pb = (uint8_t *) start_s22blk;
1731 while( nb < start_s22blk )
1733 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1735 /* TODO: A.Stevens this has to be the most word-length dependent
1736 code in the world. Better than MMX assembler though I guess... */
1737 pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1738 pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1739 pb += 2;
1740 b += 4;
1741 nb += 4;
1743 b += nextfieldline;
1744 nb = b + nextfieldline;
1748 /* Now create the 4*4 sub-sampled data from the 2*2
1749 N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1750 original. Albeit half as many lines and pixels...
1753 nextfieldline = nextfieldline >> 1;
1755 qb = start_s44blk;
1756 b = start_s22blk;
1757 nb = (start_s22blk+nextfieldline);
1759 while( nb < start_s44blk )
1761 for( i = 0; i < nextfieldline/4; ++i )
1763 /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1764 qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1765 qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1766 qb += 2;
1767 b += 4;
1768 nb += 4;
1770 b += nextfieldline;
1771 nb = b + nextfieldline;
1774 #ifdef TEST_RCSEARCH
1775 /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1778 Initial row sums....
1780 pb = blk;
1781 for(j = 0; j < height; ++j )
1783 rowsum = 0;
1784 for( i = 0; i < 16; ++ i )
1786 rowsum += pb[i];
1788 rowsums[j] = rowsum;
1789 pb += width;
1793 Initial column sums
1795 for( i = 0; i < width; ++i )
1797 colsums[i] = 0;
1799 pb = blk;
1800 for( j = 0; j < 16; ++j )
1802 for( i = 0; i < width; ++i )
1804 colsums[i] += *pb;
1805 ++pb;
1809 /* Now fill in the row/column sum tables...
1810 Note: to allow efficient construction of sum/col differences for a
1811 given position row sums are held in a *column major* array
1812 (changing y co-ordinate makes for small index changes)
1813 the col sums are held in a *row major* array
1816 pb = blk;
1817 pc = start_colblk;
1818 for(j = 0; j <32; ++j )
1820 pr = start_rowblk;
1821 rowsum = rowsums[j];
1822 for( i = 0; i < width-16; ++i )
1824 pc[i] = colsums[i];
1825 pr[j] = rowsum;
1826 colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1827 rowsum = (rowsum + pb[16]) - pb[0];
1828 ++pb;
1829 pr += height;
1831 pb += 16; /* move pb on to next row... rememember we only did width-16! */
1832 pc += width;
1834 #endif
1838 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1840 uint8_t *p1 = s22blk1;
1841 uint8_t *p2 = s22blk2;
1842 int s = 0;
1843 int j;
1845 for( j = 0; j < fh; ++j )
1847 register int diff;
1848 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1849 pipestep(0); pipestep(1);
1850 pipestep(2); pipestep(3);
1851 pipestep(4); pipestep(5);
1852 pipestep(6); pipestep(7);
1853 p1 += flx;
1854 p2 += flx;
1855 #undef pipestep
1858 return s;
1864 Sum absolute differences for 4*4 sub-sampled data.
1866 TODO: currently assumes only 16*16 or 16*8 motion compensation will be used...
1867 I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1871 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1873 register uint8_t *p1 = s44blk1;
1874 register uint8_t *p2 = s44blk2;
1875 int s = 0;
1876 register int diff;
1878 /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1879 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1880 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1881 if( qh > 1 )
1883 p1 += qlx; p2 += qlx;
1884 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1885 if( qh > 2 )
1887 p1 += qlx; p2 += qlx;
1888 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1889 p1 += qlx; p2 += qlx;
1890 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1895 return s;
1899 * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1900 * blk1,blk2: addresses of top left pels of both blocks
1901 * lx: distance (in bytes) of vertically adjacent pels
1902 * h: height of block (usually 8 or 16)
1905 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1907 uint8_t *p1 = blk1, *p2 = blk2;
1908 int i,j,v;
1909 int s = 0;
1910 for (j=0; j<h; j++)
1912 for (i=0; i<8; i++)
1914 v = p1[i] - p2[i];
1915 s+= v*v;
1917 p1+= lx;
1918 p2+= lx;
1920 return s;
1923 /* total squared difference between bidirection prediction of (8*h)
1924 * blocks of 2*2 sub-sampled pels and reference
1925 * blk1f, blk1b,blk2: addresses of top left
1926 * pels of blocks
1927 * lx: distance (in bytes) of vertically adjacent
1928 * pels
1929 * h: height of block (usually 4 or 8)
1932 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2,
1933 int lx, int h)
1935 uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1936 int i,j,v;
1937 int s = 0;
1938 for (j=0; j<h; j++)
1940 for (i=0; i<8; i++)
1942 v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1943 s+= v*v;
1945 p1f+= lx;
1946 p1b+= lx;
1947 p2+= lx;
1949 return s;
1953 * total squared difference between two (16*h) blocks
1954 * including optional half pel interpolation of blk1 (hx,hy)
1955 * blk1,blk2: addresses of top left pels of both blocks
1956 * lx: distance (in bytes) of vertically adjacent pels
1957 * hx,hy: flags for horizontal and/or vertical interpolation
1958 * h: height of block (usually 8 or 16)
1962 static int dist2(blk1,blk2,lx,hx,hy,h)
1963 uint8_t *blk1,*blk2;
1964 int lx,hx,hy,h;
1966 uint8_t *p1,*p1a,*p2;
1967 int i,j;
1968 int s,v;
1970 s = 0;
1971 p1 = blk1;
1972 p2 = blk2;
1973 if (!hx && !hy)
1974 for (j=0; j<h; j++)
1976 for (i=0; i<16; i++)
1978 v = p1[i] - p2[i];
1979 s+= v*v;
1981 p1+= lx;
1982 p2+= lx;
1984 else if (hx && !hy)
1985 for (j=0; j<h; j++)
1987 for (i=0; i<16; i++)
1989 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
1990 s+= v*v;
1992 p1+= lx;
1993 p2+= lx;
1995 else if (!hx && hy)
1997 p1a = p1 + lx;
1998 for (j=0; j<h; j++)
2000 for (i=0; i<16; i++)
2002 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
2003 s+= v*v;
2005 p1 = p1a;
2006 p1a+= lx;
2007 p2+= lx;
2010 else /* if (hx && hy) */
2012 p1a = p1 + lx;
2013 for (j=0; j<h; j++)
2015 for (i=0; i<16; i++)
2017 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2018 s+= v*v;
2020 p1 = p1a;
2021 p1a+= lx;
2022 p2+= lx;
2026 return s;
2031 * absolute difference error between a (16*h) block and a bidirectional
2032 * prediction
2034 * p2: address of top left pel of block
2035 * pf,hxf,hyf: address and half pel flags of forward ref. block
2036 * pb,hxb,hyb: address and half pel flags of backward ref. block
2037 * h: height of block
2038 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2042 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2043 uint8_t *pf,*pb,*p2;
2044 int lx,hxf,hyf,hxb,hyb,h;
2046 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2047 int i,j;
2048 int s,v;
2050 pfa = pf + hxf;
2051 pfb = pf + lx*hyf;
2052 pfc = pfb + hxf;
2054 pba = pb + hxb;
2055 pbb = pb + lx*hyb;
2056 pbc = pbb + hxb;
2058 s = 0;
2060 for (j=0; j<h; j++)
2062 for (i=0; i<16; i++)
2064 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2065 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2066 - *p2++;
2067 s += abs(v);
2069 p2+= lx-16;
2070 pf+= lx-16;
2071 pfa+= lx-16;
2072 pfb+= lx-16;
2073 pfc+= lx-16;
2074 pb+= lx-16;
2075 pba+= lx-16;
2076 pbb+= lx-16;
2077 pbc+= lx-16;
2080 return s;
2084 * squared error between a (16*h) block and a bidirectional
2085 * prediction
2087 * p2: address of top left pel of block
2088 * pf,hxf,hyf: address and half pel flags of forward ref. block
2089 * pb,hxb,hyb: address and half pel flags of backward ref. block
2090 * h: height of block
2091 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2095 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2096 uint8_t *pf,*pb,*p2;
2097 int lx,hxf,hyf,hxb,hyb,h;
2099 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2100 int i,j;
2101 int s,v;
2103 pfa = pf + hxf;
2104 pfb = pf + lx*hyf;
2105 pfc = pfb + hxf;
2107 pba = pb + hxb;
2108 pbb = pb + lx*hyb;
2109 pbc = pbb + hxb;
2111 s = 0;
2113 for (j=0; j<h; j++)
2115 for (i=0; i<16; i++)
2117 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2118 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2119 - *p2++;
2120 s+=v*v;
2122 p2+= lx-16;
2123 pf+= lx-16;
2124 pfa+= lx-16;
2125 pfb+= lx-16;
2126 pfc+= lx-16;
2127 pb+= lx-16;
2128 pba+= lx-16;
2129 pbb+= lx-16;
2130 pbc+= lx-16;
2133 return s;
2138 * variance of a (size*size) block, multiplied by 256
2139 * p: address of top left pel of block
2140 * lx: distance (in bytes) of vertically adjacent pels
2142 static int variance(uint8_t *p, int size, int lx)
2144 int i,j;
2145 unsigned int v,s,s2;
2147 s = s2 = 0;
2149 for (j=0; j<size; j++)
2151 for (i=0; i<size; i++)
2153 v = *p++;
2154 s+= v;
2155 s2+= v*v;
2157 p+= lx-size;
2159 return s2 - (s*s)/(size*size);
2163 Compute the variance of the residual of uni-directionally motion
2164 compensated block.
2167 static int unidir_pred_var( const mb_motion_s *motion,
2168 uint8_t *mb,
2169 int lx,
2170 int h)
2172 return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2177 Compute the variance of the residual of bi-directionally motion
2178 compensated block.
2181 static int bidir_pred_var( const mb_motion_s *motion_f,
2182 const mb_motion_s *motion_b,
2183 uint8_t *mb,
2184 int lx, int h)
2186 return (*pbdist2)( motion_f->blk, motion_b->blk,
2187 mb, lx,
2188 motion_f->hx, motion_f->hy,
2189 motion_b->hx, motion_b->hy,
2194 Compute SAD for bi-directionally motion compensated blocks...
2197 static int bidir_pred_sad( const mb_motion_s *motion_f,
2198 const mb_motion_s *motion_b,
2199 uint8_t *mb,
2200 int lx, int h)
2202 return (*pbdist1)(motion_f->blk, motion_b->blk,
2203 mb, lx,
2204 motion_f->hx, motion_f->hy,
2205 motion_b->hx, motion_b->hy,
2209 static void frame_ME(motion_engine_t *engine,
2210 pict_data_s *picture,
2211 motion_comp_s *mc,
2212 int mb_row_start,
2213 int i, int j,
2214 mbinfo_s *mbi)
2216 mb_motion_s framef_mc;
2217 mb_motion_s frameb_mc;
2218 mb_motion_s dualpf_mc;
2219 mb_motion_s topfldf_mc;
2220 mb_motion_s botfldf_mc;
2221 mb_motion_s topfldb_mc;
2222 mb_motion_s botfldb_mc;
2224 int var,v0;
2225 int vmc,vmcf,vmcr,vmci;
2226 int vmcfieldf,vmcfieldr,vmcfieldi;
2227 subsampled_mb_s ssmb;
2228 int imins[2][2],jmins[2][2];
2229 int imindmv,jmindmv,vmc_dp;
2230 //printf("frame_ME 1\n");
2233 /* A.Stevens fast motion estimation data is appended to actual
2234 luminance information
2236 ssmb.mb = mc->cur[0] + mb_row_start + i;
2237 ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2238 ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2239 ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset +
2240 ((i>>1) + (mb_row_start>>2)));
2241 ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset +
2242 (i>>2) + (mb_row_start>>4));
2244 /* Compute variance MB as a measure of Intra-coding complexity
2245 We include chrominance information here, scaled to compensate
2246 for sub-sampling. Silly MPEG forcing chrom/lum to have same
2247 quantisations...
2249 var = variance(ssmb.mb,16,width);
2251 //printf("motion %d\n", picture->pict_type);
2252 if (picture->pict_type==I_TYPE)
2254 mbi->mb_type = MB_INTRA;
2256 else if (picture->pict_type==P_TYPE)
2258 /* For P pictures we take into account chrominance. This
2259 provides much better performance at scene changes */
2260 var += chrom_var_sum(&ssmb,16,width);
2262 if (picture->frame_pred_dct)
2264 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2265 width,i,j,mc->sxf,mc->syf,16,width,height,
2266 &framef_mc);
2267 framef_mc.fieldoff = 0;
2268 vmc = framef_mc.var +
2269 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2270 mbi->motion_type = MC_FRAME;
2272 else
2274 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2275 frame_estimate(engine, mc->oldorg[0],
2276 mc->oldref[0],
2277 &ssmb,
2280 mc->sxf,
2281 mc->syf,
2282 &framef_mc,
2283 &topfldf_mc,
2284 &botfldf_mc,
2285 imins,
2286 jmins);
2287 //printf("frame_ME 3\n");
2288 vmcf = framef_mc.var +
2289 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2290 vmcfieldf =
2291 topfldf_mc.var +
2292 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2293 botfldf_mc.var +
2294 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2295 /* DEBUG DP currently disabled... */
2296 // if ( M==1)
2297 // {
2298 // dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2299 // i,j>>1,imins,jmins,
2300 // &dualpf_mc,
2301 // &imindmv,&jmindmv, &vmc_dp);
2302 // }
2304 /* NOTE: Typically M =3 so DP actually disabled... */
2305 /* select between dual prime, frame and field prediction */
2306 // if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2307 // {
2308 // mbi->motion_type = MC_DMV;
2309 // /* No chrominance squared difference measure yet.
2310 // Assume identical to luminance */
2311 // vmc = vmc_dp + vmc_dp;
2312 // }
2313 // else
2314 if ( vmcf < vmcfieldf)
2316 mbi->motion_type = MC_FRAME;
2317 vmc = vmcf;
2320 else
2322 mbi->motion_type = MC_FIELD;
2323 vmc = vmcfieldf;
2328 /* select between intra or non-intra coding:
2330 * selection is based on intra block variance (var) vs.
2331 * prediction error variance (vmc)
2333 * Used to be: blocks with small prediction error are always
2334 * coded non-intra even if variance is smaller (is this reasonable?
2336 * TODO: A.Stevens Jul 2000
2337 * The bbmpeg guys have found this to be *unreasonable*.
2338 * I'm not sure I buy their solution using vmc*2. It is probabably
2339 * the vmc>= 9*256 test that is suspect.
2344 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2346 mbi->mb_type = MB_INTRA;
2347 mbi->var = var;
2350 else
2352 /* select between MC / No-MC
2354 * use No-MC if var(No-MC) <= 1.25*var(MC)
2355 * (i.e slightly biased towards No-MC)
2357 * blocks with small prediction error are always coded as No-MC
2358 * (requires no motion vectors, allows skipping)
2360 v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2362 if (4*v0>5*vmc )
2364 /* use MC */
2365 var = vmc;
2366 mbi->mb_type = MB_FORWARD;
2367 if (mbi->motion_type==MC_FRAME)
2369 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2370 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2372 else if (mbi->motion_type==MC_DMV)
2374 /* these are FRAME vectors */
2375 /* same parity vector */
2376 mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2377 mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2379 /* opposite parity vector */
2380 mbi->dmvector[0] = imindmv;
2381 mbi->dmvector[1] = jmindmv;
2383 else
2385 /* these are FRAME vectors */
2386 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2387 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2388 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2389 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2390 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2391 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2394 else
2397 /* No-MC */
2398 var = v0;
2399 mbi->mb_type = 0;
2400 mbi->motion_type = MC_FRAME;
2401 mbi->MV[0][0][0] = 0;
2402 mbi->MV[0][0][1] = 0;
2406 else /* if (pict_type==B_TYPE) */
2408 if (picture->frame_pred_dct)
2410 var = variance(ssmb.mb,16,width);
2411 /* forward */
2412 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2413 width,i,j,mc->sxf,mc->syf,
2414 16,width,height,
2415 &framef_mc
2417 framef_mc.fieldoff = 0;
2418 vmcf = framef_mc.var;
2420 /* backward */
2421 fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2422 width,i,j,mc->sxb,mc->syb,
2423 16,width,height,
2424 &frameb_mc);
2425 frameb_mc.fieldoff = 0;
2426 vmcr = frameb_mc.var;
2428 /* interpolated (bidirectional) */
2430 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2432 /* decisions */
2434 /* select between forward/backward/interpolated prediction:
2435 * use the one with smallest mean sqaured prediction error
2437 if (vmcf<=vmcr && vmcf<=vmci)
2439 vmc = vmcf;
2440 mbi->mb_type = MB_FORWARD;
2442 else if (vmcr<=vmci)
2444 vmc = vmcr;
2445 mbi->mb_type = MB_BACKWARD;
2447 else
2449 vmc = vmci;
2450 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2453 mbi->motion_type = MC_FRAME;
2455 else
2457 /* forward prediction */
2458 frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2459 i,j,mc->sxf,mc->syf,
2460 &framef_mc,
2461 &topfldf_mc,
2462 &botfldf_mc,
2463 imins,jmins);
2466 /* backward prediction */
2467 frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2468 i,j,mc->sxb,mc->syb,
2469 &frameb_mc,
2470 &topfldb_mc,
2471 &botfldb_mc,
2472 imins,jmins);
2474 vmcf = framef_mc.var;
2475 vmcr = frameb_mc.var;
2476 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2478 vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2479 vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2480 vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb,
2481 width<<1, 8) +
2482 bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb,
2483 width<<1, 8);
2485 /* select prediction type of minimum distance from the
2486 * six candidates (field/frame * forward/backward/interpolated)
2488 if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2489 && vmci<vmcr && vmci<vmcfieldr)
2491 /* frame, interpolated */
2492 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2493 mbi->motion_type = MC_FRAME;
2494 vmc = vmci;
2496 else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2497 && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2499 /* field, interpolated */
2500 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2501 mbi->motion_type = MC_FIELD;
2502 vmc = vmcfieldi;
2504 else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2506 /* frame, forward */
2507 mbi->mb_type = MB_FORWARD;
2508 mbi->motion_type = MC_FRAME;
2509 vmc = vmcf;
2512 else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2514 /* field, forward */
2515 mbi->mb_type = MB_FORWARD;
2516 mbi->motion_type = MC_FIELD;
2517 vmc = vmcfieldf;
2519 else if (vmcr<vmcfieldr)
2521 /* frame, backward */
2522 mbi->mb_type = MB_BACKWARD;
2523 mbi->motion_type = MC_FRAME;
2524 vmc = vmcr;
2526 else
2528 /* field, backward */
2529 mbi->mb_type = MB_BACKWARD;
2530 mbi->motion_type = MC_FIELD;
2531 vmc = vmcfieldr;
2536 /* select between intra or non-intra coding:
2538 * selection is based on intra block variance (var) vs.
2539 * prediction error variance (vmc)
2541 * Used to be: blocks with small prediction error are always
2542 * coded non-intra even if variance is smaller (is this reasonable?
2544 * TODO: A.Stevens Jul 2000
2545 * The bbmpeg guys have found this to be *unreasonable*.
2546 * I'm not sure I buy their solution using vmc*2 in the first comparison.
2547 * It is probabably the vmc>= 9*256 test that is suspect.
2550 if (vmc>var && vmc>=9*256)
2551 mbi->mb_type = MB_INTRA;
2552 else
2554 var = vmc;
2555 if (mbi->motion_type==MC_FRAME)
2557 /* forward */
2558 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2559 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2560 /* backward */
2561 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2562 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2564 else
2566 /* these are FRAME vectors */
2567 /* forward */
2568 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2569 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2570 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2571 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2572 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2573 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2574 /* backward */
2575 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2576 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2577 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2578 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2579 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2580 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2589 * motion estimation for field pictures
2591 * mbi: pointer to macroblock info structure
2592 * secondfield: indicates second field of a frame (in P fields this means
2593 * that reference field of opposite parity is in curref instead
2594 * of oldref)
2595 * ipflag: indicates a P type field which is the second field of a frame
2596 * in which the first field is I type (this restricts predictions
2597 * to be based only on the opposite parity (=I) field)
2599 * results:
2600 * mbi->
2601 * mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2602 * MV[][][]: motion vectors (field format)
2603 * mv_field_sel: top/bottom field
2604 * motion_type: MC_FIELD, MC_16X8
2606 * uses global vars: pict_type, pict_struct
2608 static void field_ME(motion_engine_t *engine,
2609 pict_data_s *picture,
2610 motion_comp_s *mc,
2611 int mb_row_start,
2612 int i, int j,
2613 mbinfo_s *mbi,
2614 int secondfield, int ipflag)
2616 int w2;
2617 uint8_t *toporg, *topref, *botorg, *botref;
2618 subsampled_mb_s ssmb;
2619 mb_motion_s fields_mc, dualp_mc;
2620 mb_motion_s fieldf_mc, fieldb_mc;
2621 mb_motion_s field8uf_mc, field8lf_mc;
2622 mb_motion_s field8ub_mc, field8lb_mc;
2623 int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2624 int imins,jmins;
2625 int dmc8f,dmc8r;
2626 int vmc_dp,dmc_dp;
2628 w2 = width<<1;
2630 /* Fast motion data sits at the end of the luminance buffer */
2631 ssmb.mb = mc->cur[0] + i + w2*j;
2632 ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2633 ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2634 ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2635 ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2637 if (picture->pict_struct==BOTTOM_FIELD)
2639 ssmb.mb += width;
2640 ssmb.umb += (width >> 1);
2641 ssmb.vmb += (width >> 1);
2642 ssmb.fmb += (width >> 1);
2643 ssmb.qmb += (width >> 2);
2646 var = variance(ssmb.mb,16,w2) +
2647 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2649 if (picture->pict_type==I_TYPE)
2650 mbi->mb_type = MB_INTRA;
2651 else if (picture->pict_type==P_TYPE)
2653 toporg = mc->oldorg[0];
2654 topref = mc->oldref[0];
2655 botorg = mc->oldorg[0] + width;
2656 botref = mc->oldref[0] + width;
2658 if (secondfield)
2660 /* opposite parity field is in same frame */
2661 if (picture->pict_struct==TOP_FIELD)
2663 /* current is top field */
2664 botorg = mc->cur[0] + width;
2665 botref = mc->curref[0] + width;
2667 else
2669 /* current is bottom field */
2670 toporg = mc->cur[0];
2671 topref = mc->curref[0];
2675 field_estimate(engine,
2676 picture,
2677 toporg,topref,botorg,botref,&ssmb,
2678 i,j,mc->sxf,mc->syf,ipflag,
2679 &fieldf_mc,
2680 &field8uf_mc,
2681 &field8lf_mc,
2682 &fields_mc);
2683 dmcfield = fieldf_mc.sad;
2684 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2686 // if (M==1 && !ipflag) /* generic condition which permits Dual Prime */
2687 // {
2688 // dpfield_estimate(engine,
2689 // picture,
2690 // topref,botref,ssmb.mb,i,j,imins,jmins,
2691 // &dualp_mc,
2692 // &vmc_dp);
2693 // dmc_dp = dualp_mc.sad;
2694 // }
2696 // /* select between dual prime, field and 16x8 prediction */
2697 // if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2698 // {
2699 // /* Dual Prime prediction */
2700 // mbi->motion_type = MC_DMV;
2701 // dmc = dualp_mc.sad;
2702 // vmc = vmc_dp;
2704 // }
2705 // else
2706 if (dmc8f<dmcfield)
2708 /* 16x8 prediction */
2709 mbi->motion_type = MC_16X8;
2710 /* upper and lower half blocks */
2711 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2712 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2714 else
2716 /* field prediction */
2717 mbi->motion_type = MC_FIELD;
2718 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2721 /* select between intra and non-intra coding */
2723 if ( vmc>var && vmc>=9*256)
2724 mbi->mb_type = MB_INTRA;
2725 else
2727 /* zero MV field prediction from same parity ref. field
2728 * (not allowed if ipflag is set)
2730 if (!ipflag)
2731 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2732 ssmb.mb,w2,0,0,16);
2733 if (ipflag || (4*v0>5*vmc && v0>=9*256))
2735 var = vmc;
2736 mbi->mb_type = MB_FORWARD;
2737 if (mbi->motion_type==MC_FIELD)
2739 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2740 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2741 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2743 else if (mbi->motion_type==MC_DMV)
2745 /* same parity vector */
2746 mbi->MV[0][0][0] = imins - (i<<1);
2747 mbi->MV[0][0][1] = jmins - (j<<1);
2749 /* opposite parity vector */
2750 mbi->dmvector[0] = dualp_mc.pos.x;
2751 mbi->dmvector[1] = dualp_mc.pos.y;
2753 else
2755 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2756 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2757 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2758 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2759 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2760 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2763 else
2765 /* No MC */
2766 var = v0;
2767 mbi->mb_type = 0;
2768 mbi->motion_type = MC_FIELD;
2769 mbi->MV[0][0][0] = 0;
2770 mbi->MV[0][0][1] = 0;
2771 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2775 else /* if (pict_type==B_TYPE) */
2777 /* forward prediction */
2778 field_estimate(engine,
2779 picture,
2780 mc->oldorg[0],mc->oldref[0],
2781 mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2782 i,j,mc->sxf,mc->syf,0,
2783 &fieldf_mc,
2784 &field8uf_mc,
2785 &field8lf_mc,
2786 &fields_mc);
2787 dmcfieldf = fieldf_mc.sad;
2788 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2790 /* backward prediction */
2791 field_estimate(engine,
2792 picture,
2793 mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2794 &ssmb,
2795 i,j,mc->sxb,mc->syb,0,
2796 &fieldb_mc,
2797 &field8ub_mc,
2798 &field8lb_mc,
2799 &fields_mc);
2800 dmcfieldr = fieldb_mc.sad;
2801 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2803 /* calculate distances for bidirectional prediction */
2804 /* field */
2805 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2807 /* 16x8 upper and lower half blocks */
2808 dmc8i = bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2809 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2811 /* select prediction type of minimum distance */
2812 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2813 && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2815 /* field, interpolated */
2816 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2817 mbi->motion_type = MC_FIELD;
2818 vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2820 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2821 && dmc8i<dmcfieldr && dmc8i<dmc8r)
2823 /* 16x8, interpolated */
2824 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2825 mbi->motion_type = MC_16X8;
2827 /* upper and lower half blocks */
2828 vmc = bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2829 vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2831 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2833 /* field, forward */
2834 mbi->mb_type = MB_FORWARD;
2835 mbi->motion_type = MC_FIELD;
2836 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2838 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2840 /* 16x8, forward */
2841 mbi->mb_type = MB_FORWARD;
2842 mbi->motion_type = MC_16X8;
2844 /* upper and lower half blocks */
2845 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2846 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2848 else if (dmcfieldr<dmc8r)
2850 /* field, backward */
2851 mbi->mb_type = MB_BACKWARD;
2852 mbi->motion_type = MC_FIELD;
2853 vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2855 else
2857 /* 16x8, backward */
2858 mbi->mb_type = MB_BACKWARD;
2859 mbi->motion_type = MC_16X8;
2861 /* upper and lower half blocks */
2862 vmc = unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2863 vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2867 /* select between intra and non-intra coding */
2868 if (vmc>var && vmc>=9*256)
2869 mbi->mb_type = MB_INTRA;
2870 else
2872 var = vmc;
2873 if (mbi->motion_type==MC_FIELD)
2875 /* forward */
2876 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2877 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2878 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2879 /* backward */
2880 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2881 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2882 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2884 else /* MC_16X8 */
2886 /* forward */
2887 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2888 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2889 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2890 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2891 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2892 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2893 /* backward */
2894 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2895 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2896 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2897 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2898 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2899 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2909 Initialise motion compensation - currently purely selection of which
2910 versions of the various low level computation routines to use
2914 void init_motion()
2916 int cpucap = cpu_accel();
2918 if( cpucap == 0 ) /* No MMX/SSE etc support available */
2920 pdist22 = dist22;
2921 pdist44 = dist44;
2922 pdist1_00 = dist1_00;
2923 pdist1_01 = dist1_01;
2924 pdist1_10 = dist1_10;
2925 pdist1_11 = dist1_11;
2926 pbdist1 = bdist1;
2927 pdist2 = dist2;
2928 pbdist2 = bdist2;
2929 pdist2_22 = dist2_22;
2930 pbdist2_22 = bdist2_22;
2931 pfind_best_one_pel = find_best_one_pel;
2932 pbuild_sub22_mcomps = build_sub22_mcomps;
2934 #ifdef X86_CPU
2935 else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2937 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2938 pdist22 = dist22_mmxe;
2939 pdist44 = dist44_mmxe;
2940 pdist1_00 = dist1_00_mmxe;
2941 pdist1_01 = dist1_01_mmxe;
2942 pdist1_10 = dist1_10_mmxe;
2943 pdist1_11 = dist1_11_mmxe;
2944 pbdist1 = bdist1_mmx;
2945 pdist2 = dist2_mmx;
2946 pbdist2 = bdist2_mmx;
2947 pdist2_22 = dist2_22_mmx;
2948 pbdist2_22 = bdist2_22_mmx;
2949 pfind_best_one_pel = find_best_one_pel_mmxe;
2950 pbuild_sub22_mcomps = build_sub22_mcomps_mmxe;
2951 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2954 else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2956 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2957 pdist22 = dist22_mmx;
2958 pdist44 = dist44_mmx;
2959 pdist1_00 = dist1_00_mmx;
2960 pdist1_01 = dist1_01_mmx;
2961 pdist1_10 = dist1_10_mmx;
2962 pdist1_11 = dist1_11_mmx;
2963 pbdist1 = bdist1_mmx;
2964 pdist2 = dist2_mmx;
2965 pbdist2 = bdist2_mmx;
2966 pdist2_22 = dist2_22_mmx;
2967 pbdist2_22 = bdist2_22_mmx;
2968 pfind_best_one_pel = find_best_one_pel;
2969 pbuild_sub22_mcomps = build_sub22_mcomps;
2970 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2972 #endif
2976 void motion_engine_loop(motion_engine_t *engine)
2978 while(!engine->done)
2980 pthread_mutex_lock(&(engine->input_lock));
2982 if(!engine->done)
2984 pict_data_s *picture = engine->pict_data;
2985 motion_comp_s *mc_data = engine->motion_comp;
2986 int secondfield = engine->secondfield;
2987 int ipflag = engine->ipflag;
2988 mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2989 int i, j;
2990 int mb_row_incr; /* Offset increment to go down 1 row of mb's */
2991 int mb_row_start;
2993 if (picture->pict_struct == FRAME_PICTURE)
2995 mb_row_incr = 16*width;
2996 mb_row_start = engine->start_row / 16 * mb_row_incr;
2997 /* loop through all macroblocks of a frame picture */
2998 for (j=engine->start_row; j < engine->end_row; j+=16)
3000 for (i=0; i<width; i+=16)
3002 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
3003 mbi++;
3005 mb_row_start += mb_row_incr;
3008 else
3010 mb_row_incr = (16 * 2) * width;
3011 mb_row_start = engine->start_row / 16 * mb_row_incr;
3012 /* loop through all macroblocks of a field picture */
3013 for (j=engine->start_row; j < engine->end_row; j+=16)
3015 for (i=0; i<width; i+=16)
3017 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3018 mbi,secondfield,ipflag);
3019 mbi++;
3021 mb_row_start += mb_row_incr;
3026 pthread_mutex_unlock(&(engine->output_lock));
3031 void motion_estimation(pict_data_s *picture,
3032 motion_comp_s *mc_data,
3033 int secondfield, int ipflag)
3035 int i;
3036 /* Start loop */
3037 for(i = 0; i < processors; i++)
3039 motion_engines[i].motion_comp = mc_data;
3041 motion_engines[i].pict_data = picture;
3043 motion_engines[i].secondfield = secondfield;
3044 motion_engines[i].ipflag = ipflag;
3045 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3048 /* Wait for completion */
3049 for(i = 0; i < processors; i++)
3051 pthread_mutex_lock(&(motion_engines[i].output_lock));
3055 void start_motion_engines()
3057 int i;
3058 int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3059 int current_row = 0;
3060 pthread_attr_t attr;
3061 pthread_mutexattr_t mutex_attr;
3063 pthread_mutexattr_init(&mutex_attr);
3064 pthread_attr_init(&attr);
3065 motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3066 for(i = 0; i < processors; i++)
3068 motion_engines[i].start_row = current_row * i * 16;
3069 current_row += rows_per_processor;
3070 if(current_row > height2 / 16) current_row = height2 / 16;
3071 motion_engines[i].end_row = current_row * 16;
3072 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3073 pthread_mutex_lock(&(motion_engines[i].input_lock));
3074 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3075 pthread_mutex_lock(&(motion_engines[i].output_lock));
3076 motion_engines[i].done = 0;
3077 pthread_create(&(motion_engines[i].tid),
3078 &attr,
3079 (void*)motion_engine_loop,
3080 &motion_engines[i]);
3084 void stop_motion_engines()
3086 int i;
3087 for(i = 0; i < processors; i++)
3089 motion_engines[i].done = 1;
3090 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3091 pthread_join(motion_engines[i].tid, 0);
3092 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3093 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3095 free(motion_engines);