/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file h264.c
 * H.264 / AVC / MPEG4 part10 codec.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */
28 #include "dsputil.h"
29 #include "avcodec.h"
30 #include "mpegvideo.h"
31 #include "h264.h"
32 #include "h264data.h"
33 #include "h264_parser.h"
34 #include "golomb.h"
35 #include "rectangle.h"
37 #include "cabac.h"
38 #ifdef ARCH_X86
39 #include "i386/h264_i386.h"
40 #endif
42 //#undef NDEBUG
43 #include <assert.h>
/**
 * Value of Picture.reference when Picture is not a reference picture, but
 * is held for delayed output.
 */
#define DELAYED_PIC_REF 4
51 static VLC coeff_token_vlc[4];
52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
55 static VLC chroma_dc_coeff_token_vlc;
56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
59 static VLC total_zeros_vlc[15];
60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
61 static const int total_zeros_vlc_tables_size = 512;
63 static VLC chroma_dc_total_zeros_vlc[3];
64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
67 static VLC run_vlc[6];
68 static VLC_TYPE run_vlc_tables[6][8][2];
69 static const int run_vlc_tables_size = 8;
71 static VLC run7_vlc;
72 static VLC_TYPE run7_vlc_table[96][2];
73 static const int run7_vlc_table_size = 96;
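/* The tables above back the CAVLC VLCs (coeff_token, total_zeros and
 * run_before/run7).  The *_size constants give the number of rows available
 * in each statically allocated VLC_TYPE[][2] table, so the VLCs can be built
 * once at init time into static storage instead of being allocated per
 * decoder instance. */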
75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
static av_always_inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
    return (b&0xFFFF) + (a<<16);
#else
    return (a&0xFFFF) + (b<<16);
#endif
}
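/* pack16to32() packs two signed 16-bit values into one 32-bit word in memory
 * order, so a motion vector pair (mv_x, mv_y) can be read or written with a
 * single 32-bit access; e.g. on little endian pack16to32(1, -2) == 0xFFFE0001,
 * i.e. 0x0001 in the low half and 0xFFFE in the high half. */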
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};

static const int left_block_options[4][8]={
    {0,1,2,3,7,10,8,11},
    {2,2,3,3,8,11,8,11},
    {0,0,1,1,7,10,7,10},
    {0,2,0,2,7,10,7,10}
};
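/* rem6[] and div6[] give qp%6 and qp/6 without an integer division, e.g.
 * rem6[23] == 5 and div6[23] == 3 since 23 == 6*3 + 5; the dequant scale for
 * a given qp is then taken from the qp%6 entry of the base table shifted left
 * by qp/6.  left_block_options[] selects which sub-blocks of the left
 * neighbour are used, depending on the MBAFF frame/field pairing. */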
104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
105 MpegEncContext * const s = &h->s;
106 const int mb_xy= h->mb_xy;
107 int topleft_xy, top_xy, topright_xy, left_xy[2];
108 int topleft_type, top_type, topright_type, left_type[2];
109 int * left_block;
110 int topleft_partition= -1;
111 int i;
113 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
115 //FIXME deblocking could skip the intra and nnz parts.
116 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
117 return;
119 /* Wow, what a mess, why didn't they simplify the interlacing & intra
120 * stuff, I can't imagine that these complex rules are worth it. */
122 topleft_xy = top_xy - 1;
123 topright_xy= top_xy + 1;
124 left_xy[1] = left_xy[0] = mb_xy-1;
125 left_block = left_block_options[0];
126 if(FRAME_MBAFF){
127 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
128 const int top_pair_xy = pair_xy - s->mb_stride;
129 const int topleft_pair_xy = top_pair_xy - 1;
130 const int topright_pair_xy = top_pair_xy + 1;
131 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
132 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
133 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
134 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
135 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
136 const int bottom = (s->mb_y & 1);
137 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
138 if (bottom
139 ? !curr_mb_frame_flag // bottom macroblock
140 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
142 top_xy -= s->mb_stride;
144 if (bottom
145 ? !curr_mb_frame_flag // bottom macroblock
146 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
148 topleft_xy -= s->mb_stride;
149 } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
150 topleft_xy += s->mb_stride;
151 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
152 topleft_partition = 0;
154 if (bottom
155 ? !curr_mb_frame_flag // bottom macroblock
156 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
158 topright_xy -= s->mb_stride;
160 if (left_mb_frame_flag != curr_mb_frame_flag) {
161 left_xy[1] = left_xy[0] = pair_xy - 1;
162 if (curr_mb_frame_flag) {
163 if (bottom) {
164 left_block = left_block_options[1];
165 } else {
166 left_block= left_block_options[2];
168 } else {
169 left_xy[1] += s->mb_stride;
170 left_block = left_block_options[3];
175 h->top_mb_xy = top_xy;
176 h->left_mb_xy[0] = left_xy[0];
177 h->left_mb_xy[1] = left_xy[1];
178 if(for_deblock){
179 topleft_type = 0;
180 topright_type = 0;
181 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
182 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
183 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
185 if(MB_MBAFF && !IS_INTRA(mb_type)){
186 int list;
187 for(list=0; list<h->list_count; list++){
//These values were changed for ease of performing MC, we need to change them back
189 //FIXME maybe we can make MC and loop filter use the same values or prevent
190 //the MC code from changing ref_cache and rather use a temporary array.
191 if(USES_LIST(mb_type,list)){
192 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
193 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
194 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
195 ref += h->b8_stride;
196 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
197 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
201 }else{
202 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
203 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
204 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
205 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
206 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
208 if(IS_INTRA(mb_type)){
209 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
210 h->topleft_samples_available=
211 h->top_samples_available=
212 h->left_samples_available= 0xFFFF;
213 h->topright_samples_available= 0xEEEA;
215 if(!(top_type & type_mask)){
216 h->topleft_samples_available= 0xB3FF;
217 h->top_samples_available= 0x33FF;
218 h->topright_samples_available= 0x26EA;
220 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
221 if(IS_INTERLACED(mb_type)){
222 if(!(left_type[0] & type_mask)){
223 h->topleft_samples_available&= 0xDFFF;
224 h->left_samples_available&= 0x5FFF;
226 if(!(left_type[1] & type_mask)){
227 h->topleft_samples_available&= 0xFF5F;
228 h->left_samples_available&= 0xFF5F;
230 }else{
231 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
232 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
233 assert(left_xy[0] == left_xy[1]);
234 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
239 }else{
240 if(!(left_type[0] & type_mask)){
241 h->topleft_samples_available&= 0xDF5F;
242 h->left_samples_available&= 0x5F5F;
246 if(!(topleft_type & type_mask))
247 h->topleft_samples_available&= 0x7FFF;
249 if(!(topright_type & type_mask))
250 h->topright_samples_available&= 0xFBFF;
252 if(IS_INTRA4x4(mb_type)){
253 if(IS_INTRA4x4(top_type)){
254 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
255 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
256 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
257 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
258 }else{
259 int pred;
260 if(!(top_type & type_mask))
261 pred= -1;
262 else{
263 pred= 2;
265 h->intra4x4_pred_mode_cache[4+8*0]=
266 h->intra4x4_pred_mode_cache[5+8*0]=
267 h->intra4x4_pred_mode_cache[6+8*0]=
268 h->intra4x4_pred_mode_cache[7+8*0]= pred;
270 for(i=0; i<2; i++){
271 if(IS_INTRA4x4(left_type[i])){
272 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
273 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
274 }else{
275 int pred;
276 if(!(left_type[i] & type_mask))
277 pred= -1;
278 else{
279 pred= 2;
281 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
282 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
/*
 0 . T T. T T T T
 1 L . .L . . . .
 2 L . .L . . . .
 3 . T TL . . . .
 4 L . .L . . . .
 5 L . .. . . . .
*/
298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
299 if(top_type){
300 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
301 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
302 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
303 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
305 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
306 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
308 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
309 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
311 }else{
312 h->non_zero_count_cache[4+8*0]=
313 h->non_zero_count_cache[5+8*0]=
314 h->non_zero_count_cache[6+8*0]=
315 h->non_zero_count_cache[7+8*0]=
317 h->non_zero_count_cache[1+8*0]=
318 h->non_zero_count_cache[2+8*0]=
320 h->non_zero_count_cache[1+8*3]=
321 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
325 for (i=0; i<2; i++) {
326 if(left_type[i]){
327 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
328 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
329 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
330 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
331 }else{
332 h->non_zero_count_cache[3+8*1 + 2*8*i]=
333 h->non_zero_count_cache[3+8*2 + 2*8*i]=
334 h->non_zero_count_cache[0+8*1 + 8*i]=
335 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
339 if( h->pps.cabac ) {
340 // top_cbp
341 if(top_type) {
342 h->top_cbp = h->cbp_table[top_xy];
343 } else if(IS_INTRA(mb_type)) {
344 h->top_cbp = 0x1C0;
345 } else {
346 h->top_cbp = 0;
348 // left_cbp
349 if (left_type[0]) {
350 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
351 } else if(IS_INTRA(mb_type)) {
352 h->left_cbp = 0x1C0;
353 } else {
354 h->left_cbp = 0;
356 if (left_type[0]) {
357 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
359 if (left_type[1]) {
360 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
364 #if 1
365 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
366 int list;
367 for(list=0; list<h->list_count; list++){
368 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
369 /*if(!h->mv_cache_clean[list]){
370 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
371 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
372 h->mv_cache_clean[list]= 1;
374 continue;
376 h->mv_cache_clean[list]= 0;
378 if(USES_LIST(top_type, list)){
379 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
380 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
381 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
382 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
383 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
384 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
385 h->ref_cache[list][scan8[0] + 0 - 1*8]=
386 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
387 h->ref_cache[list][scan8[0] + 2 - 1*8]=
388 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
389 }else{
390 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
391 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
392 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
393 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
394 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
397 for(i=0; i<2; i++){
398 int cache_idx = scan8[0] - 1 + i*2*8;
399 if(USES_LIST(left_type[i], list)){
400 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
401 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
402 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
403 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
404 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
405 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
406 }else{
407 *(uint32_t*)h->mv_cache [list][cache_idx ]=
408 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
409 h->ref_cache[list][cache_idx ]=
410 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
414 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
415 continue;
417 if(USES_LIST(topleft_type, list)){
418 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
419 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
420 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
421 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
422 }else{
423 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
424 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
427 if(USES_LIST(topright_type, list)){
428 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
429 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
430 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
431 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
432 }else{
433 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
434 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
437 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
438 continue;
440 h->ref_cache[list][scan8[5 ]+1] =
441 h->ref_cache[list][scan8[7 ]+1] =
442 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
443 h->ref_cache[list][scan8[4 ]] =
444 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
445 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
446 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
447 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
448 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
449 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
451 if( h->pps.cabac ) {
/* XXX yuck, load mvd */
453 if(USES_LIST(top_type, list)){
454 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
455 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
456 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
457 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
458 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
459 }else{
460 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
461 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
462 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
463 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
465 if(USES_LIST(left_type[0], list)){
466 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
467 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
468 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
469 }else{
470 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
471 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
473 if(USES_LIST(left_type[1], list)){
474 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
475 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
476 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
477 }else{
478 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
479 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
481 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
482 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
483 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
484 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
485 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
487 if(h->slice_type_nos == FF_B_TYPE){
488 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
490 if(IS_DIRECT(top_type)){
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
492 }else if(IS_8X8(top_type)){
493 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
494 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
495 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
496 }else{
497 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
500 if(IS_DIRECT(left_type[0]))
501 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
502 else if(IS_8X8(left_type[0]))
503 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
504 else
505 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
507 if(IS_DIRECT(left_type[1]))
508 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
509 else if(IS_8X8(left_type[1]))
510 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
511 else
512 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
516 if(FRAME_MBAFF){
517 #define MAP_MVS\
518 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
519 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
520 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
521 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
522 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
523 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
524 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
525 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
526 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
527 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
528 if(MB_FIELD){
529 #define MAP_F2F(idx, mb_type)\
530 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
531 h->ref_cache[list][idx] <<= 1;\
532 h->mv_cache[list][idx][1] /= 2;\
533 h->mvd_cache[list][idx][1] /= 2;\
535 MAP_MVS
536 #undef MAP_F2F
537 }else{
538 #define MAP_F2F(idx, mb_type)\
539 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
540 h->ref_cache[list][idx] >>= 1;\
541 h->mv_cache[list][idx][1] <<= 1;\
542 h->mvd_cache[list][idx][1] <<= 1;\
544 MAP_MVS
545 #undef MAP_F2F
550 #endif
552 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
555 static inline void write_back_intra_pred_mode(H264Context *h){
556 const int mb_xy= h->mb_xy;
558 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
559 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
560 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
561 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
562 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
563 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
564 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
/**
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 */
570 static inline int check_intra4x4_pred_mode(H264Context *h){
571 MpegEncContext * const s = &h->s;
572 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
573 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
574 int i;
576 if(!(h->top_samples_available&0x8000)){
577 for(i=0; i<4; i++){
578 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
579 if(status<0){
580 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
581 return -1;
582 } else if(status){
583 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
588 if((h->left_samples_available&0x8888)!=0x8888){
589 static const int mask[4]={0x8000,0x2000,0x80,0x20};
590 for(i=0; i<4; i++){
591 if(!(h->left_samples_available&mask[i])){
592 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
593 if(status<0){
594 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
595 return -1;
596 } else if(status){
597 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
603 return 0;
604 } //FIXME cleanup like next
/**
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 */
609 static inline int check_intra_pred_mode(H264Context *h, int mode){
610 MpegEncContext * const s = &h->s;
611 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
612 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
614 if(mode > 6U) {
615 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
616 return -1;
619 if(!(h->top_samples_available&0x8000)){
620 mode= top[ mode ];
621 if(mode<0){
622 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
623 return -1;
627 if((h->left_samples_available&0x8080) != 0x8080){
628 mode= left[ mode ];
629 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
630 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
632 if(mode<0){
633 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
634 return -1;
638 return mode;
/**
 * gets the predicted intra4x4 prediction mode.
 */
644 static inline int pred_intra_mode(H264Context *h, int n){
645 const int index8= scan8[n];
646 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
647 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
648 const int min= FFMIN(left, top);
650 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
652 if(min<0) return DC_PRED;
653 else return min;
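/* The predicted intra4x4 mode is the smaller of the left and top neighbour
 * modes; if either neighbour is unavailable (-1 in the cache) the prediction
 * falls back to DC_PRED, which is the "min of neighbours, DC fallback" rule
 * from the spec. */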
656 static inline void write_back_non_zero_count(H264Context *h){
657 const int mb_xy= h->mb_xy;
659 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
660 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
661 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
662 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
663 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
664 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
665 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
667 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
668 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
669 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
671 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
672 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
673 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
/**
 * gets the predicted number of non-zero coefficients.
 * @param n block index
 */
680 static inline int pred_non_zero_count(H264Context *h, int n){
681 const int index8= scan8[n];
682 const int left= h->non_zero_count_cache[index8 - 1];
683 const int top = h->non_zero_count_cache[index8 - 8];
684 int i= left + top;
686 if(i<64) i= (i+1)>>1;
688 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
690 return i&31;
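/* The predicted number of non-zero coefficients is the rounded average of the
 * left and top neighbours.  Unavailable neighbours are stored as 64 in the
 * cache: the sum is then >= 64, the halving is skipped, and the final &31
 * mask leaves just the other neighbour's count (0 if both are missing). */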
693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
694 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
695 MpegEncContext *s = &h->s;
697 /* there is no consistent mapping of mvs to neighboring locations that will
698 * make mbaff happy, so we can't move all this logic to fill_caches */
699 if(FRAME_MBAFF){
700 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
701 const int16_t *mv;
702 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
703 *C = h->mv_cache[list][scan8[0]-2];
705 if(!MB_FIELD
706 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
707 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
708 if(IS_INTERLACED(mb_types[topright_xy])){
709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
710 const int x4 = X4, y4 = Y4;\
711 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
712 if(!USES_LIST(mb_type,list))\
713 return LIST_NOT_USED;\
714 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
715 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
716 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
717 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
719 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
722 if(topright_ref == PART_NOT_AVAILABLE
723 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
724 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
725 if(!MB_FIELD
726 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
727 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
729 if(MB_FIELD
730 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
731 && i >= scan8[0]+8){
732 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
733 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
736 #undef SET_DIAG_MV
739 if(topright_ref != PART_NOT_AVAILABLE){
740 *C= h->mv_cache[list][ i - 8 + part_width ];
741 return topright_ref;
742 }else{
743 tprintf(s->avctx, "topright MV not available\n");
745 *C= h->mv_cache[list][ i - 8 - 1 ];
746 return h->ref_cache[list][ i - 8 - 1 ];
/**
 * gets the predicted MV.
 * @param n the block index
 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
758 const int index8= scan8[n];
759 const int top_ref= h->ref_cache[list][ index8 - 8 ];
760 const int left_ref= h->ref_cache[list][ index8 - 1 ];
761 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
762 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
763 const int16_t * C;
764 int diagonal_ref, match_count;
766 assert(part_width==1 || part_width==2 || part_width==4);
    /* mv_cache
       B . . A T T T T
       U . . L . . , .
       U . . L . . . .
       U . . L . . , .
       . . . L . . . .
    */
776 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
777 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
778 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
779 if(match_count > 1){ //most common
780 *mx= mid_pred(A[0], B[0], C[0]);
781 *my= mid_pred(A[1], B[1], C[1]);
782 }else if(match_count==1){
783 if(left_ref==ref){
784 *mx= A[0];
785 *my= A[1];
786 }else if(top_ref==ref){
787 *mx= B[0];
788 *my= B[1];
789 }else{
790 *mx= C[0];
791 *my= C[1];
793 }else{
794 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
795 *mx= A[0];
796 *my= A[1];
797 }else{
798 *mx= mid_pred(A[0], B[0], C[0]);
799 *my= mid_pred(A[1], B[1], C[1]);
803 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
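/* Median motion vector prediction: each component of the predictor is the
 * median of the left (A), top (B) and diagonal (C) neighbours.  If exactly
 * one neighbour uses the requested reference index its vector is taken
 * directly, and if only the left neighbour is available at all, A is used;
 * this mirrors the luma MV prediction process of the spec. */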
/**
 * gets the directionally predicted 16x8 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
813 if(n==0){
814 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
815 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
817 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
819 if(top_ref == ref){
820 *mx= B[0];
821 *my= B[1];
822 return;
824 }else{
825 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
826 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
828 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
830 if(left_ref == ref){
831 *mx= A[0];
832 *my= A[1];
833 return;
837 //RARE
838 pred_motion(h, n, 4, list, ref, mx, my);
/**
 * gets the directionally predicted 8x16 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
848 if(n==0){
849 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
850 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
852 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
854 if(left_ref == ref){
855 *mx= A[0];
856 *my= A[1];
857 return;
859 }else{
860 const int16_t * C;
861 int diagonal_ref;
863 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
865 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
867 if(diagonal_ref == ref){
868 *mx= C[0];
869 *my= C[1];
870 return;
874 //RARE
875 pred_motion(h, n, 2, list, ref, mx, my);
878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
879 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
880 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
882 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
884 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
885 || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
886 || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
888 *mx = *my = 0;
889 return;
892 pred_motion(h, 0, 4, 0, 0, mx, my);
894 return;
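/* P_Skip motion: the MV is forced to (0,0) when either neighbour is
 * unavailable, or when one of them uses ref_idx 0 with a zero vector;
 * otherwise the regular 16x16 median prediction above is applied with
 * ref_idx 0. */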
897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
898 int poc0 = h->ref_list[0][i].poc;
899 int td = av_clip(poc1 - poc0, -128, 127);
900 if(td == 0 || h->ref_list[0][i].long_ref){
901 return 256;
902 }else{
903 int tb = av_clip(poc - poc0, -128, 127);
904 int tx = (16384 + (FFABS(td) >> 1)) / td;
905 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
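/* Worked example of the temporal-direct scale factor above: with
 * tb = poc - poc0 = 2 and td = poc1 - poc0 = 4,
 * tx = (16384 + 2) / 4 = 4096 and the result is (2*4096 + 32) >> 6 = 128,
 * i.e. colocated vectors get scaled by 128/256 = 1/2.  Long-term references
 * (and td == 0) use the neutral factor 256. */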
909 static inline void direct_dist_scale_factor(H264Context * const h){
910 MpegEncContext * const s = &h->s;
911 const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
912 const int poc1 = h->ref_list[1][0].poc;
913 int i, field;
914 for(field=0; field<2; field++){
915 const int poc = h->s.current_picture_ptr->field_poc[field];
916 const int poc1 = h->ref_list[1][0].field_poc[field];
917 for(i=0; i < 2*h->ref_count[0]; i++)
918 h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
921 for(i=0; i<h->ref_count[0]; i++){
922 h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
927 MpegEncContext * const s = &h->s;
928 Picture * const ref1 = &h->ref_list[1][0];
929 int j, old_ref, rfield;
930 int start= mbafi ? 16 : 0;
931 int end = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
932 int interl= mbafi || s->picture_structure != PICT_FRAME;
934 /* bogus; fills in for missing frames */
935 memset(map[list], 0, sizeof(map[list]));
937 for(rfield=0; rfield<2; rfield++){
938 for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
939 int poc = ref1->ref_poc[colfield][list][old_ref];
941 if (!interl)
942 poc |= 3;
else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isn't needed
944 poc= (poc&~3) + rfield + 1;
946 for(j=start; j<end; j++){
947 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
948 int cur_ref= mbafi ? (j-16)^field : j;
949 map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
950 if(rfield == field)
951 map[list][old_ref] = cur_ref;
952 break;
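/* fill_colmap() builds the ref index map used by temporal direct prediction:
 * for every reference of the colocated picture (ref_list[1][0]) it looks up
 * the entry of the current reference list that refers to the same picture,
 * matching on 4*frame_num plus the field parity kept in the low bits of
 * Picture.reference; entries at offset 16 hold the MBAFF field variants. */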
959 static inline void direct_ref_list_init(H264Context * const h){
960 MpegEncContext * const s = &h->s;
961 Picture * const ref1 = &h->ref_list[1][0];
962 Picture * const cur = s->current_picture_ptr;
963 int list, j, field;
964 int sidx= (s->picture_structure&1)^1;
965 int ref1sidx= (ref1->reference&1)^1;
967 for(list=0; list<2; list++){
968 cur->ref_count[sidx][list] = h->ref_count[list];
969 for(j=0; j<h->ref_count[list]; j++)
970 cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
973 if(s->picture_structure == PICT_FRAME){
974 memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
975 memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
978 cur->mbaff= FRAME_MBAFF;
980 if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
981 return;
983 for(list=0; list<2; list++){
984 fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
985 for(field=0; field<2; field++)
986 fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
991 MpegEncContext * const s = &h->s;
992 int b8_stride = h->b8_stride;
993 int b4_stride = h->b_stride;
994 int mb_xy = h->mb_xy;
995 int mb_type_col[2];
996 const int16_t (*l1mv0)[2], (*l1mv1)[2];
997 const int8_t *l1ref0, *l1ref1;
998 const int is_b8x8 = IS_8X8(*mb_type);
999 unsigned int sub_mb_type;
1000 int i8, i4;
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1004 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
1006 int cur_poc = s->current_picture_ptr->poc;
1007 int *col_poc = h->ref_list[1]->field_poc;
1008 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010 b8_stride = 0;
1011 }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012 int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013 mb_xy += s->mb_stride*fieldoff;
1015 goto single_col;
1016 }else{ // AFL/AFR/FR/FL -> AFR/FR
1017 if(IS_INTERLACED(*mb_type)){ // AFL /FL -> AFR/FR
1018 mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021 b8_stride *= 3;
1022 b4_stride *= 6;
1023 //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026 && !is_b8x8){
1027 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028 *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029 }else{
1030 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1033 }else{ // AFR/FR -> AFR/FR
1034 single_col:
1035 mb_type_col[0] =
1036 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037 if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039 * so we know exactly what block size to use */
1040 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042 }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044 *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045 }else{
1046 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1052 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053 l1mv1 = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054 l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055 l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056 if(!b8_stride){
1057 if(s->mb_y&1){
1058 l1ref0 += h->b8_stride;
1059 l1ref1 += h->b8_stride;
1060 l1mv0 += 2*b4_stride;
1061 l1mv1 += 2*b4_stride;
1065 if(h->direct_spatial_mv_pred){
1066 int ref[2];
1067 int mv[2][2];
1068 int list;
1070 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1072 /* ref = min(neighbors) */
1073 for(list=0; list<2; list++){
1074 int refa = h->ref_cache[list][scan8[0] - 1];
1075 int refb = h->ref_cache[list][scan8[0] - 8];
1076 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077 if(refc == PART_NOT_AVAILABLE)
1078 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079 ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080 if(ref[list] < 0)
1081 ref[list] = -1;
1084 if(ref[0] < 0 && ref[1] < 0){
1085 ref[0] = ref[1] = 0;
1086 mv[0][0] = mv[0][1] =
1087 mv[1][0] = mv[1][1] = 0;
1088 }else{
1089 for(list=0; list<2; list++){
1090 if(ref[list] >= 0)
1091 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092 else
1093 mv[list][0] = mv[list][1] = 0;
1097 if(ref[1] < 0){
1098 if(!is_b8x8)
1099 *mb_type &= ~MB_TYPE_L1;
1100 sub_mb_type &= ~MB_TYPE_L1;
1101 }else if(ref[0] < 0){
1102 if(!is_b8x8)
1103 *mb_type &= ~MB_TYPE_L0;
1104 sub_mb_type &= ~MB_TYPE_L0;
1107 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108 for(i8=0; i8<4; i8++){
1109 int x8 = i8&1;
1110 int y8 = i8>>1;
1111 int xy8 = x8+y8*b8_stride;
1112 int xy4 = 3*x8+y8*b4_stride;
1113 int a=0, b=0;
1115 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116 continue;
1117 h->sub_mb_type[i8] = sub_mb_type;
1119 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121 if(!IS_INTRA(mb_type_col[y8])
1122 && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123 || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124 if(ref[0] > 0)
1125 a= pack16to32(mv[0][0],mv[0][1]);
1126 if(ref[1] > 0)
1127 b= pack16to32(mv[1][0],mv[1][1]);
1128 }else{
1129 a= pack16to32(mv[0][0],mv[0][1]);
1130 b= pack16to32(mv[1][0],mv[1][1]);
1132 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1135 }else if(IS_16X16(*mb_type)){
1136 int a=0, b=0;
1138 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140 if(!IS_INTRA(mb_type_col[0])
1141 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143 && (h->x264_build>33 || !h->x264_build)))){
1144 if(ref[0] > 0)
1145 a= pack16to32(mv[0][0],mv[0][1]);
1146 if(ref[1] > 0)
1147 b= pack16to32(mv[1][0],mv[1][1]);
1148 }else{
1149 a= pack16to32(mv[0][0],mv[0][1]);
1150 b= pack16to32(mv[1][0],mv[1][1]);
1152 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154 }else{
1155 for(i8=0; i8<4; i8++){
1156 const int x8 = i8&1;
1157 const int y8 = i8>>1;
1159 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160 continue;
1161 h->sub_mb_type[i8] = sub_mb_type;
1163 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1168 /* col_zero_flag */
1169 if(!IS_INTRA(mb_type_col[0]) && ( l1ref0[x8 + y8*b8_stride] == 0
1170 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171 && (h->x264_build>33 || !h->x264_build)))){
1172 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173 if(IS_SUB_8X8(sub_mb_type)){
1174 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176 if(ref[0] == 0)
1177 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178 if(ref[1] == 0)
1179 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1181 }else
1182 for(i4=0; i4<4; i4++){
1183 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185 if(ref[0] == 0)
1186 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187 if(ref[1] == 0)
1188 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1194 }else{ /* direct temporal mv pred */
1195 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196 const int *dist_scale_factor = h->dist_scale_factor;
1197 int ref_offset= 0;
1199 if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200 map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201 map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202 dist_scale_factor =h->dist_scale_factor_field[s->mb_y&1];
1204 if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205 ref_offset += 16;
1207 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208 /* FIXME assumes direct_8x8_inference == 1 */
1209 int y_shift = 2*!IS_INTERLACED(*mb_type);
1211 for(i8=0; i8<4; i8++){
1212 const int x8 = i8&1;
1213 const int y8 = i8>>1;
1214 int ref0, scale;
1215 const int16_t (*l1mv)[2]= l1mv0;
1217 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218 continue;
1219 h->sub_mb_type[i8] = sub_mb_type;
1221 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222 if(IS_INTRA(mb_type_col[y8])){
1223 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226 continue;
1229 ref0 = l1ref0[x8 + y8*b8_stride];
1230 if(ref0 >= 0)
1231 ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232 else{
1233 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234 l1mv= l1mv1;
1236 scale = dist_scale_factor[ref0];
1237 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1240 const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241 int my_col = (mv_col[1]<<y_shift)/2;
1242 int mx = (scale * mv_col[0] + 128) >> 8;
1243 int my = (scale * my_col + 128) >> 8;
1244 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1248 return;
1251 /* one-to-one mv scaling */
1253 if(IS_16X16(*mb_type)){
1254 int ref, mv0, mv1;
1256 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257 if(IS_INTRA(mb_type_col[0])){
1258 ref=mv0=mv1=0;
1259 }else{
1260 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262 const int scale = dist_scale_factor[ref0];
1263 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264 int mv_l0[2];
1265 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267 ref= ref0;
1268 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1271 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274 }else{
1275 for(i8=0; i8<4; i8++){
1276 const int x8 = i8&1;
1277 const int y8 = i8>>1;
1278 int ref0, scale;
1279 const int16_t (*l1mv)[2]= l1mv0;
1281 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282 continue;
1283 h->sub_mb_type[i8] = sub_mb_type;
1284 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285 if(IS_INTRA(mb_type_col[0])){
1286 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289 continue;
1292 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293 if(ref0 >= 0)
1294 ref0 = map_col_to_list0[0][ref0];
1295 else{
1296 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297 l1mv= l1mv1;
1299 scale = dist_scale_factor[ref0];
1301 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302 if(IS_SUB_8X8(sub_mb_type)){
1303 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304 int mx = (scale * mv_col[0] + 128) >> 8;
1305 int my = (scale * mv_col[1] + 128) >> 8;
1306 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308 }else
1309 for(i4=0; i4<4; i4++){
1310 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323 MpegEncContext * const s = &h->s;
1324 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326 int list;
1328 if(!USES_LIST(mb_type, 0))
1329 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1331 for(list=0; list<h->list_count; list++){
1332 int y;
1333 if(!USES_LIST(mb_type, list))
1334 continue;
1336 for(y=0; y<4; y++){
1337 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1340 if( h->pps.cabac ) {
1341 if(IS_SKIP(mb_type))
1342 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343 else
1344 for(y=0; y<4; y++){
1345 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1351 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1359 if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360 if(IS_8X8(mb_type)){
1361 uint8_t *direct_table = &h->direct_table[b8_xy];
1362 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
/**
 * Decodes a network abstraction layer unit.
 * @param consumed is the number of bytes used as input
 * @param length is the length of the array
 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
 * @returns decoded bytes, might be src+1 if no escapes
 */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377 int i, si, di;
1378 uint8_t *dst;
1379 int bufidx;
1381 // src[0]&0x80; //forbidden bit
1382 h->nal_ref_idc= src[0]>>5;
1383 h->nal_unit_type= src[0]&0x1F;
1385 src++; length--;
1386 #if 0
1387 for(i=0; i<length; i++)
1388 printf("%2X ", src[i]);
1389 #endif
1390 for(i=0; i+1<length; i+=2){
1391 if(src[i]) continue;
1392 if(i>0 && src[i-1]==0) i--;
1393 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394 if(src[i+2]!=3){
1395 /* startcode, so we must be past the end */
1396 length=i;
1398 break;
1402 if(i>=length-1){ //no escaped 0
1403 *dst_length= length;
1404 *consumed= length+1; //+1 for the header
1405 return src;
1408 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409 h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410 dst= h->rbsp_buffer[bufidx];
1412 if (dst == NULL){
1413 return NULL;
1416 //printf("decoding esc\n");
1417 si=di=0;
1418 while(si<length){
1419 //remove escapes (very rare 1:2^22)
1420 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421 if(src[si+2]==3){ //escape
1422 dst[di++]= 0;
1423 dst[di++]= 0;
1424 si+=3;
1425 continue;
1426 }else //next start code
1427 break;
1430 dst[di++]= src[si++];
1433 *dst_length= di;
1434 *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436 return dst;
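#if 0
/* Illustrative sketch (not compiled): the unescaping step above on its own.
 * H.264 inserts an emulation-prevention byte 0x03 after every 0x00 0x00 pair
 * in the RBSP so that start codes cannot appear inside a NAL unit; decoding
 * turns 00 00 03 xx back into 00 00 xx. */
static int unescape_rbsp_sketch(uint8_t *dst, const uint8_t *src, int length){
    int si=0, di=0;
    while(si<length){
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]==3){
            dst[di++]= 0;
            dst[di++]= 0;
            si+=3;           // skip the 0x03 emulation-prevention byte
            continue;
        }
        dst[di++]= src[si++];
    }
    return di;               // number of unescaped bytes written to dst
}
#endif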
/**
 * identifies the exact end of the bitstream
 * @return the length of the trailing, or 0 if damaged
 */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444 int v= *src;
1445 int r;
1447 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1449 for(r=1; r<9; r++){
1450 if(v&1) return r;
1451 v>>=1;
1453 return 0;
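/* The RBSP trailing bits are a single '1' stop bit followed by zero padding
 * up to the byte boundary, so the return value is how many bits of the last
 * byte belong to the trailing: 0x80 -> 8 (the whole byte is trailing),
 * 0x01 -> 1, and 0x00 -> 0 (no stop bit found, i.e. damaged). */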
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qp quantization parameter
 */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462 int i;
1463 int temp[16]; //FIXME check if this is a good idea
1464 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1465 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1467 //memset(block, 64, 2*256);
1468 //return;
1469 for(i=0; i<4; i++){
1470 const int offset= y_offset[i];
1471 const int z0= block[offset+stride*0] + block[offset+stride*4];
1472 const int z1= block[offset+stride*0] - block[offset+stride*4];
1473 const int z2= block[offset+stride*1] - block[offset+stride*5];
1474 const int z3= block[offset+stride*1] + block[offset+stride*5];
1476 temp[4*i+0]= z0+z3;
1477 temp[4*i+1]= z1+z2;
1478 temp[4*i+2]= z1-z2;
1479 temp[4*i+3]= z0-z3;
1482 for(i=0; i<4; i++){
1483 const int offset= x_offset[i];
1484 const int z0= temp[4*0+i] + temp[4*2+i];
1485 const int z1= temp[4*0+i] - temp[4*2+i];
1486 const int z2= temp[4*1+i] - temp[4*3+i];
1487 const int z3= temp[4*1+i] + temp[4*3+i];
1489 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
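/* The code above is the inverse 4x4 Hadamard transform applied to the 16 luma
 * DC coefficients of an Intra_16x16 macroblock (x_offset/y_offset gather the
 * DC of each 4x4 block out of the 16x16 block array), followed by
 * dequantization with rounding: (coeff*qmul + 128) >> 8. */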
1496 #if 0
/**
 * DCT transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
1501 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1502 // const int qmul= dequant_coeff[qp][0];
1503 int i;
1504 int temp[16]; //FIXME check if this is a good idea
1505 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1506 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1508 for(i=0; i<4; i++){
1509 const int offset= y_offset[i];
1510 const int z0= block[offset+stride*0] + block[offset+stride*4];
1511 const int z1= block[offset+stride*0] - block[offset+stride*4];
1512 const int z2= block[offset+stride*1] - block[offset+stride*5];
1513 const int z3= block[offset+stride*1] + block[offset+stride*5];
1515 temp[4*i+0]= z0+z3;
1516 temp[4*i+1]= z1+z2;
1517 temp[4*i+2]= z1-z2;
1518 temp[4*i+3]= z0-z3;
1521 for(i=0; i<4; i++){
1522 const int offset= x_offset[i];
1523 const int z0= temp[4*0+i] + temp[4*2+i];
1524 const int z1= temp[4*0+i] - temp[4*2+i];
1525 const int z2= temp[4*1+i] - temp[4*3+i];
1526 const int z3= temp[4*1+i] + temp[4*3+i];
1528 block[stride*0 +offset]= (z0 + z3)>>1;
1529 block[stride*2 +offset]= (z1 + z2)>>1;
1530 block[stride*8 +offset]= (z1 - z2)>>1;
1531 block[stride*10+offset]= (z0 - z3)>>1;
1534 #endif
1536 #undef xStride
1537 #undef stride
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540 const int stride= 16*2;
1541 const int xStride= 16;
1542 int a,b,c,d,e;
1544 a= block[stride*0 + xStride*0];
1545 b= block[stride*0 + xStride*1];
1546 c= block[stride*1 + xStride*0];
1547 d= block[stride*1 + xStride*1];
1549 e= a-b;
1550 a= a+b;
1551 b= c-d;
1552 c= c+d;
1554 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
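/* Inverse 2x2 Hadamard (butterfly) on the four chroma DC coefficients,
 * followed by dequantization with qmul; the DCs sit 16 DCTELEMs apart because
 * each 4x4 chroma block occupies 16 consecutive entries of the block array,
 * hence the stride/xStride constants. */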
1560 #if 0
1561 static void chroma_dc_dct_c(DCTELEM *block){
1562 const int stride= 16*2;
1563 const int xStride= 16;
1564 int a,b,c,d,e;
1566 a= block[stride*0 + xStride*0];
1567 b= block[stride*0 + xStride*1];
1568 c= block[stride*1 + xStride*0];
1569 d= block[stride*1 + xStride*1];
1571 e= a-b;
1572 a= a+b;
1573 b= c-d;
1574 c= c+d;
1576 block[stride*0 + xStride*0]= (a+c);
1577 block[stride*0 + xStride*1]= (e+b);
1578 block[stride*1 + xStride*0]= (a-c);
1579 block[stride*1 + xStride*1]= (e-b);
1581 #endif
/**
 * gets the chroma qp.
 */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587 return h->pps.chroma_qp_table[t][qscale];
1590 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1591 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1592 int src_x_offset, int src_y_offset,
1593 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1594 MpegEncContext * const s = &h->s;
1595 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1596 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1597 const int luma_xy= (mx&3) + ((my&3)<<2);
1598 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1599 uint8_t * src_cb, * src_cr;
1600 int extra_width= h->emu_edge_width;
1601 int extra_height= h->emu_edge_height;
1602 int emu=0;
1603 const int full_mx= mx>>2;
1604 const int full_my= my>>2;
1605 const int pic_width = 16*s->mb_width;
1606 const int pic_height = 16*s->mb_height >> MB_FIELD;
1608 if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1609 return;
1611 if(mx&7) extra_width -= 3;
1612 if(my&7) extra_height -= 3;
1614 if( full_mx < 0-extra_width
1615 || full_my < 0-extra_height
1616 || full_mx + 16/*FIXME*/ > pic_width + extra_width
1617 || full_my + 16/*FIXME*/ > pic_height + extra_height){
1618 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1619 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1620 emu=1;
1623 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1624 if(!square){
1625 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1628 if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1630 if(MB_FIELD){
1631 // chroma offset when predicting from a field of opposite parity
1632 my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1633 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1635 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1638 if(emu){
1639 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1640 src_cb= s->edge_emu_buffer;
1642 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1644 if(emu){
1645 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1646 src_cr= s->edge_emu_buffer;
1648 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
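/* Illustrative sketch (not used anywhere; the helper name is made up): luma
 * motion vectors are in quarter-pel units and chroma vectors in eighth-pel
 * units, which is why mc_dir_part() above splits each component with >>2/&3
 * for luma and >>3/&7 for chroma. */
#if 0
static void split_mv_component(int mv, int *luma_full, int *luma_frac,
                               int *chroma_full, int *chroma_frac){
    *luma_full   = mv >> 2; /* full-pel luma position */
    *luma_frac   = mv &  3; /* quarter-pel phase, selects the qpel filter */
    *chroma_full = mv >> 3; /* full-pel chroma position */
    *chroma_frac = mv &  7; /* eighth-pel phase for the bilinear chroma filter */
}
#endif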
1651 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1652 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1653 int x_offset, int y_offset,
1654 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1655 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1656 int list0, int list1){
1657 MpegEncContext * const s = &h->s;
1658 qpel_mc_func *qpix_op= qpix_put;
1659 h264_chroma_mc_func chroma_op= chroma_put;
1661 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1662 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1663 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1664 x_offset += 8*s->mb_x;
1665 y_offset += 8*(s->mb_y >> MB_FIELD);
1667 if(list0){
1668 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1669 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1670 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1671 qpix_op, chroma_op);
1673 qpix_op= qpix_avg;
1674 chroma_op= chroma_avg;
1677 if(list1){
1678 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1679 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1680 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1681 qpix_op, chroma_op);
1685 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1686 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1687 int x_offset, int y_offset,
1688 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1689 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1690 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1691 int list0, int list1){
1692 MpegEncContext * const s = &h->s;
1694 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1695 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1696 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1697 x_offset += 8*s->mb_x;
1698 y_offset += 8*(s->mb_y >> MB_FIELD);
1700 if(list0 && list1){
1701 /* don't optimize for luma-only case, since B-frames usually
1702 * use implicit weights => chroma too. */
1703 uint8_t *tmp_cb = s->obmc_scratchpad;
1704 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1705 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1706 int refn0 = h->ref_cache[0][ scan8[n] ];
1707 int refn1 = h->ref_cache[1][ scan8[n] ];
1709 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1710 dest_y, dest_cb, dest_cr,
1711 x_offset, y_offset, qpix_put, chroma_put);
1712 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1713 tmp_y, tmp_cb, tmp_cr,
1714 x_offset, y_offset, qpix_put, chroma_put);
1716 if(h->use_weight == 2){
1717 int weight0 = h->implicit_weight[refn0][refn1];
1718 int weight1 = 64 - weight0;
1719 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1720 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1722 }else{
1723 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1724 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1725 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1726 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1727 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1728 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1729 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1730 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1731 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1733 }else{
1734 int list = list1 ? 1 : 0;
1735 int refn = h->ref_cache[list][ scan8[n] ];
1736 Picture *ref= &h->ref_list[list][refn];
1737 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1738 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1739 qpix_put, chroma_put);
1741 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1742 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1743 if(h->use_weight_chroma){
1744 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1745 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1746 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1752 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1753 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1754 int x_offset, int y_offset,
1755 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1756 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1757 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1758 int list0, int list1){
1759 if((h->use_weight==2 && list0 && list1
1760 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1761 || h->use_weight==1)
1762 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1763 x_offset, y_offset, qpix_put, chroma_put,
1764 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1765 else
1766 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1767 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
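/* Note (illustrative): mc_part() above takes the weighted path only for
 * explicit weighted prediction (use_weight==1) or for implicit weighting when
 * the derived weight pair is not the trivial 32/32; otherwise the plain
 * put/avg motion compensation of mc_part_std() is cheaper and gives the same
 * result. */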
1770 static inline void prefetch_motion(H264Context *h, int list){
1771 /* fetch pixels for estimated mv 4 macroblocks ahead
1772 * optimized for 64-byte cache lines */
1773 MpegEncContext * const s = &h->s;
1774 const int refn = h->ref_cache[list][scan8[0]];
1775 if(refn >= 0){
1776 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1777 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1778 uint8_t **src= h->ref_list[list][refn].data;
1779 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1780 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1781 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1782 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1786 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1788 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1789 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1790 MpegEncContext * const s = &h->s;
1791 const int mb_xy= h->mb_xy;
1792 const int mb_type= s->current_picture.mb_type[mb_xy];
1794 assert(IS_INTER(mb_type));
1796 prefetch_motion(h, 0);
1798 if(IS_16X16(mb_type)){
1799 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1800 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1801 &weight_op[0], &weight_avg[0],
1802 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1803 }else if(IS_16X8(mb_type)){
1804 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1805 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1806 &weight_op[1], &weight_avg[1],
1807 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1808 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1809 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1810 &weight_op[1], &weight_avg[1],
1811 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1812 }else if(IS_8X16(mb_type)){
1813 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1814 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1815 &weight_op[2], &weight_avg[2],
1816 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1818 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1819 &weight_op[2], &weight_avg[2],
1820 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821 }else{
1822 int i;
1824 assert(IS_8X8(mb_type));
1826 for(i=0; i<4; i++){
1827 const int sub_mb_type= h->sub_mb_type[i];
1828 const int n= 4*i;
1829 int x_offset= (i&1)<<2;
1830 int y_offset= (i&2)<<1;
1832 if(IS_SUB_8X8(sub_mb_type)){
1833 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1834 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1835 &weight_op[3], &weight_avg[3],
1836 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1837 }else if(IS_SUB_8X4(sub_mb_type)){
1838 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1840 &weight_op[4], &weight_avg[4],
1841 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1842 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1843 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1844 &weight_op[4], &weight_avg[4],
1845 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846 }else if(IS_SUB_4X8(sub_mb_type)){
1847 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1849 &weight_op[5], &weight_avg[5],
1850 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1852 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1853 &weight_op[5], &weight_avg[5],
1854 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855 }else{
1856 int j;
1857 assert(IS_SUB_4X4(sub_mb_type));
1858 for(j=0; j<4; j++){
1859 int sub_x_offset= x_offset + 2*(j&1);
1860 int sub_y_offset= y_offset + (j&2);
1861 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1862 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1863 &weight_op[6], &weight_avg[6],
1864 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1870 prefetch_motion(h, 1);
1873 static av_cold void decode_init_vlc(void){
1874 static int done = 0;
1876 if (!done) {
1877 int i;
1878 int offset;
1879 done = 1;
1881 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1882 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1883 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1884 &chroma_dc_coeff_token_len [0], 1, 1,
1885 &chroma_dc_coeff_token_bits[0], 1, 1,
1886 INIT_VLC_USE_NEW_STATIC);
1888 offset = 0;
1889 for(i=0; i<4; i++){
1890 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1891 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1892 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1893 &coeff_token_len [i][0], 1, 1,
1894 &coeff_token_bits[i][0], 1, 1,
1895 INIT_VLC_USE_NEW_STATIC);
1896 offset += coeff_token_vlc_tables_size[i];
1899 * This is a one-time safety check to make sure that
1900 * the packed static coeff_token_vlc table sizes
1901 * were initialized correctly.
1903 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1905 for(i=0; i<3; i++){
1906 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1907 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1908 init_vlc(&chroma_dc_total_zeros_vlc[i],
1909 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1910 &chroma_dc_total_zeros_len [i][0], 1, 1,
1911 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1912 INIT_VLC_USE_NEW_STATIC);
1914 for(i=0; i<15; i++){
1915 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1916 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1917 init_vlc(&total_zeros_vlc[i],
1918 TOTAL_ZEROS_VLC_BITS, 16,
1919 &total_zeros_len [i][0], 1, 1,
1920 &total_zeros_bits[i][0], 1, 1,
1921 INIT_VLC_USE_NEW_STATIC);
1924 for(i=0; i<6; i++){
1925 run_vlc[i].table = run_vlc_tables[i];
1926 run_vlc[i].table_allocated = run_vlc_tables_size;
1927 init_vlc(&run_vlc[i],
1928 RUN_VLC_BITS, 7,
1929 &run_len [i][0], 1, 1,
1930 &run_bits[i][0], 1, 1,
1931 INIT_VLC_USE_NEW_STATIC);
1933 run7_vlc.table = run7_vlc_table;
1934 run7_vlc.table_allocated = run7_vlc_table_size;
1935 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1936 &run_len [6][0], 1, 1,
1937 &run_bits[6][0], 1, 1,
1938 INIT_VLC_USE_NEW_STATIC);
1942 static void free_tables(H264Context *h){
1943 int i;
1944 H264Context *hx;
1945 av_freep(&h->intra4x4_pred_mode);
1946 av_freep(&h->chroma_pred_mode_table);
1947 av_freep(&h->cbp_table);
1948 av_freep(&h->mvd_table[0]);
1949 av_freep(&h->mvd_table[1]);
1950 av_freep(&h->direct_table);
1951 av_freep(&h->non_zero_count);
1952 av_freep(&h->slice_table_base);
1953 h->slice_table= NULL;
1955 av_freep(&h->mb2b_xy);
1956 av_freep(&h->mb2b8_xy);
1958 for(i = 0; i < h->s.avctx->thread_count; i++) {
1959 hx = h->thread_context[i];
1960 if(!hx) continue;
1961 av_freep(&hx->top_borders[1]);
1962 av_freep(&hx->top_borders[0]);
1963 av_freep(&hx->s.obmc_scratchpad);
1967 static void init_dequant8_coeff_table(H264Context *h){
1968 int i,q,x;
1969 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1970 h->dequant8_coeff[0] = h->dequant8_buffer[0];
1971 h->dequant8_coeff[1] = h->dequant8_buffer[1];
1973 for(i=0; i<2; i++ ){
1974 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1975 h->dequant8_coeff[1] = h->dequant8_buffer[0];
1976 break;
1979 for(q=0; q<52; q++){
1980 int shift = div6[q];
1981 int idx = rem6[q];
1982 for(x=0; x<64; x++)
1983 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1984 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1985 h->pps.scaling_matrix8[i][x]) << shift;
1990 static void init_dequant4_coeff_table(H264Context *h){
1991 int i,j,q,x;
1992 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1993 for(i=0; i<6; i++ ){
1994 h->dequant4_coeff[i] = h->dequant4_buffer[i];
1995 for(j=0; j<i; j++){
1996 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1997 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1998 break;
2001 if(j<i)
2002 continue;
2004 for(q=0; q<52; q++){
2005 int shift = div6[q] + 2;
2006 int idx = rem6[q];
2007 for(x=0; x<16; x++)
2008 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2009 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2010 h->pps.scaling_matrix4[i][x]) << shift;
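/* Note (illustrative, not normative): the dequantization scale doubles every
 * 6 QP steps, which is why the loops above index a small base table with
 * rem6[q] and then left-shift by div6[q]; i.e. roughly
 * dequant(q, x) = base[q % 6][x] * scaling_matrix[x] << (q / 6). */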
2015 static void init_dequant_tables(H264Context *h){
2016 int i,x;
2017 init_dequant4_coeff_table(h);
2018 if(h->pps.transform_8x8_mode)
2019 init_dequant8_coeff_table(h);
2020 if(h->sps.transform_bypass){
2021 for(i=0; i<6; i++)
2022 for(x=0; x<16; x++)
2023 h->dequant4_coeff[i][0][x] = 1<<6;
2024 if(h->pps.transform_8x8_mode)
2025 for(i=0; i<2; i++)
2026 for(x=0; x<64; x++)
2027 h->dequant8_coeff[i][0][x] = 1<<6;
2033 * allocates tables.
2034 * needs width/height to have been set
2036 static int alloc_tables(H264Context *h){
2037 MpegEncContext * const s = &h->s;
2038 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2039 int x,y;
2041 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2043 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2044 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2045 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2047 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2048 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2049 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2050 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2052 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
2053 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2055 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2056 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2057 for(y=0; y<s->mb_height; y++){
2058 for(x=0; x<s->mb_width; x++){
2059 const int mb_xy= x + y*s->mb_stride;
2060 const int b_xy = 4*x + 4*y*h->b_stride;
2061 const int b8_xy= 2*x + 2*y*h->b8_stride;
2063 h->mb2b_xy [mb_xy]= b_xy;
2064 h->mb2b8_xy[mb_xy]= b8_xy;
2068 s->obmc_scratchpad = NULL;
2070 if(!h->dequant4_coeff[0])
2071 init_dequant_tables(h);
2073 return 0;
2074 fail:
2075 free_tables(h);
2076 return -1;
2080 * Mimic alloc_tables(), but for every context thread.
2082 static void clone_tables(H264Context *dst, H264Context *src){
2083 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2084 dst->non_zero_count = src->non_zero_count;
2085 dst->slice_table = src->slice_table;
2086 dst->cbp_table = src->cbp_table;
2087 dst->mb2b_xy = src->mb2b_xy;
2088 dst->mb2b8_xy = src->mb2b8_xy;
2089 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2090 dst->mvd_table[0] = src->mvd_table[0];
2091 dst->mvd_table[1] = src->mvd_table[1];
2092 dst->direct_table = src->direct_table;
2094 dst->s.obmc_scratchpad = NULL;
2095 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2099 * Init context
2100 * Allocate buffers which are not shared amongst multiple threads.
2102 static int context_init(H264Context *h){
2103 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2106 return 0;
2107 fail:
2108 return -1; // free_tables will clean up for us
2111 static av_cold void common_init(H264Context *h){
2112 MpegEncContext * const s = &h->s;
2114 s->width = s->avctx->width;
2115 s->height = s->avctx->height;
2116 s->codec_id= s->avctx->codec->id;
2118 ff_h264_pred_init(&h->hpc, s->codec_id);
2120 h->dequant_coeff_pps= -1;
2121 s->unrestricted_mv=1;
2122 s->decode=1; //FIXME
2124 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2125 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2128 static av_cold int decode_init(AVCodecContext *avctx){
2129 H264Context *h= avctx->priv_data;
2130 MpegEncContext * const s = &h->s;
2132 MPV_decode_defaults(s);
2134 s->avctx = avctx;
2135 common_init(h);
2137 s->out_format = FMT_H264;
2138 s->workaround_bugs= avctx->workaround_bugs;
2140 // set defaults
2141 // s->decode_mb= ff_h263_decode_mb;
2142 s->quarter_sample = 1;
2143 s->low_delay= 1;
2145 if(avctx->codec_id == CODEC_ID_SVQ3)
2146 avctx->pix_fmt= PIX_FMT_YUVJ420P;
2147 else
2148 avctx->pix_fmt= PIX_FMT_YUV420P;
2150 decode_init_vlc();
2152 if(avctx->extradata_size > 0 && avctx->extradata &&
2153 *(char *)avctx->extradata == 1){
2154 h->is_avc = 1;
2155 h->got_avcC = 0;
2156 } else {
2157 h->is_avc = 0;
2160 h->thread_context[0] = h;
2161 h->outputed_poc = INT_MIN;
2162 h->prev_poc_msb= 1<<16;
2163 return 0;
2166 static int frame_start(H264Context *h){
2167 MpegEncContext * const s = &h->s;
2168 int i;
2170 if(MPV_frame_start(s, s->avctx) < 0)
2171 return -1;
2172 ff_er_frame_start(s);
2174 * MPV_frame_start uses pict_type to derive key_frame.
2175 * This is incorrect for H.264; IDR markings must be used.
2176 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2177 * See decode_nal_units().
2179 s->current_picture_ptr->key_frame= 0;
2181 assert(s->linesize && s->uvlinesize);
2183 for(i=0; i<16; i++){
2184 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2185 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2187 for(i=0; i<4; i++){
2188 h->block_offset[16+i]=
2189 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2190 h->block_offset[24+16+i]=
2191 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2194 /* can't be in alloc_tables because linesize isn't known there.
2195 * FIXME: redo bipred weight to not require extra buffer? */
2196 for(i = 0; i < s->avctx->thread_count; i++)
2197 if(!h->thread_context[i]->s.obmc_scratchpad)
2198 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2200 /* some macroblocks will be accessed before they're available */
2201 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2202 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2204 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206 // We mark the current picture as non-reference after allocating it, so
2207 // that if we break out due to an error it can be released automatically
2208 // in the next MPV_frame_start().
2209 // SVQ3, like most other codecs, has only last/next/current pictures, so they
2210 // get released even when marked as reference; besides, SVQ3 and the others do
2211 // not mark frames as reference later "naturally".
2212 if(s->codec_id != CODEC_ID_SVQ3)
2213 s->current_picture_ptr->reference= 0;
2215 s->current_picture_ptr->field_poc[0]=
2216 s->current_picture_ptr->field_poc[1]= INT_MAX;
2217 assert(s->current_picture_ptr->long_ref==0);
2219 return 0;
2222 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2223 MpegEncContext * const s = &h->s;
2224 int i;
2225 int step = 1;
2226 int offset = 1;
2227 int uvoffset= 1;
2228 int top_idx = 1;
2229 int skiplast= 0;
2231 src_y -= linesize;
2232 src_cb -= uvlinesize;
2233 src_cr -= uvlinesize;
2235 if(!simple && FRAME_MBAFF){
2236 if(s->mb_y&1){
2237 offset = MB_MBAFF ? 1 : 17;
2238 uvoffset= MB_MBAFF ? 1 : 9;
2239 if(!MB_MBAFF){
2240 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2241 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2242 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2243 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2244 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2247 }else{
2248 if(!MB_MBAFF){
2249 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2250 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2251 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2252 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2254 skiplast= 1;
2256 offset =
2257 uvoffset=
2258 top_idx = MB_MBAFF ? 0 : 1;
2260 step= MB_MBAFF ? 2 : 1;
2263 // There are two lines saved: the line above the top macroblock of a pair,
2264 // and the line above the bottom macroblock
2265 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2266 for(i=1; i<17 - skiplast; i++){
2267 h->left_border[offset+i*step]= src_y[15+i* linesize];
2270 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2271 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2273 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2274 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2275 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2276 for(i=1; i<9 - skiplast; i++){
2277 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2278 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2280 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2281 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2285 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2286 MpegEncContext * const s = &h->s;
2287 int temp8, i;
2288 uint64_t temp64;
2289 int deblock_left;
2290 int deblock_top;
2291 int mb_xy;
2292 int step = 1;
2293 int offset = 1;
2294 int uvoffset= 1;
2295 int top_idx = 1;
2297 if(!simple && FRAME_MBAFF){
2298 if(s->mb_y&1){
2299 offset = MB_MBAFF ? 1 : 17;
2300 uvoffset= MB_MBAFF ? 1 : 9;
2301 }else{
2302 offset =
2303 uvoffset=
2304 top_idx = MB_MBAFF ? 0 : 1;
2306 step= MB_MBAFF ? 2 : 1;
2309 if(h->deblocking_filter == 2) {
2310 mb_xy = h->mb_xy;
2311 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2312 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2313 } else {
2314 deblock_left = (s->mb_x > 0);
2315 deblock_top = (s->mb_y > !!MB_FIELD);
2318 src_y -= linesize + 1;
2319 src_cb -= uvlinesize + 1;
2320 src_cr -= uvlinesize + 1;
2322 #define XCHG(a,b,t,xchg)\
2323 t= a;\
2324 if(xchg)\
2325 a= b;\
2326 b= t;
2328 if(deblock_left){
2329 for(i = !deblock_top; i<16; i++){
2330 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2332 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
2335 if(deblock_top){
2336 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2337 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2338 if(s->mb_x+1 < s->mb_width){
2339 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2343 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2344 if(deblock_left){
2345 for(i = !deblock_top; i<8; i++){
2346 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2347 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2349 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2350 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2352 if(deblock_top){
2353 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2354 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2359 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2360 MpegEncContext * const s = &h->s;
2361 const int mb_x= s->mb_x;
2362 const int mb_y= s->mb_y;
2363 const int mb_xy= h->mb_xy;
2364 const int mb_type= s->current_picture.mb_type[mb_xy];
2365 uint8_t *dest_y, *dest_cb, *dest_cr;
2366 int linesize, uvlinesize /*dct_offset*/;
2367 int i;
2368 int *block_offset = &h->block_offset[0];
2369 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2370 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2371 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2373 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2374 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2375 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2377 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2378 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2380 if (!simple && MB_FIELD) {
2381 linesize = h->mb_linesize = s->linesize * 2;
2382 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2383 block_offset = &h->block_offset[24];
2384 if(mb_y&1){ //FIXME move out of this function?
2385 dest_y -= s->linesize*15;
2386 dest_cb-= s->uvlinesize*7;
2387 dest_cr-= s->uvlinesize*7;
2389 if(FRAME_MBAFF) {
2390 int list;
2391 for(list=0; list<h->list_count; list++){
2392 if(!USES_LIST(mb_type, list))
2393 continue;
2394 if(IS_16X16(mb_type)){
2395 int8_t *ref = &h->ref_cache[list][scan8[0]];
2396 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2397 }else{
2398 for(i=0; i<16; i+=4){
2399 int ref = h->ref_cache[list][scan8[i]];
2400 if(ref >= 0)
2401 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2406 } else {
2407 linesize = h->mb_linesize = s->linesize;
2408 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2409 // dct_offset = s->linesize * 16;
2412 if(transform_bypass){
2413 idct_dc_add =
2414 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2415 }else if(IS_8x8DCT(mb_type)){
2416 idct_dc_add = s->dsp.h264_idct8_dc_add;
2417 idct_add = s->dsp.h264_idct8_add;
2418 }else{
2419 idct_dc_add = s->dsp.h264_idct_dc_add;
2420 idct_add = s->dsp.h264_idct_add;
2423 if (!simple && IS_INTRA_PCM(mb_type)) {
2424 for (i=0; i<16; i++) {
2425 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2427 for (i=0; i<8; i++) {
2428 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2429 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2431 } else {
2432 if(IS_INTRA(mb_type)){
2433 if(h->deblocking_filter)
2434 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2436 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2437 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2438 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2441 if(IS_INTRA4x4(mb_type)){
2442 if(simple || !s->encoding){
2443 if(IS_8x8DCT(mb_type)){
2444 for(i=0; i<16; i+=4){
2445 uint8_t * const ptr= dest_y + block_offset[i];
2446 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2447 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2448 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2449 (h->topright_samples_available<<i)&0x4000, linesize);
2450 if(nnz){
2451 if(nnz == 1 && h->mb[i*16])
2452 idct_dc_add(ptr, h->mb + i*16, linesize);
2453 else
2454 idct_add(ptr, h->mb + i*16, linesize);
2457 }else
2458 for(i=0; i<16; i++){
2459 uint8_t * const ptr= dest_y + block_offset[i];
2460 uint8_t *topright;
2461 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2462 int nnz, tr;
2464 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2465 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2466 assert(mb_y || linesize <= block_offset[i]);
2467 if(!topright_avail){
2468 tr= ptr[3 - linesize]*0x01010101;
2469 topright= (uint8_t*) &tr;
2470 }else
2471 topright= ptr + 4 - linesize;
2472 }else
2473 topright= NULL;
2475 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2476 nnz = h->non_zero_count_cache[ scan8[i] ];
2477 if(nnz){
2478 if(is_h264){
2479 if(nnz == 1 && h->mb[i*16])
2480 idct_dc_add(ptr, h->mb + i*16, linesize);
2481 else
2482 idct_add(ptr, h->mb + i*16, linesize);
2483 }else
2484 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2488 }else{
2489 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2490 if(is_h264){
2491 if(!transform_bypass)
2492 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2493 }else
2494 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2496 if(h->deblocking_filter)
2497 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2498 }else if(is_h264){
2499 hl_motion(h, dest_y, dest_cb, dest_cr,
2500 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2501 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2502 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2506 if(!IS_INTRA4x4(mb_type)){
2507 if(is_h264){
2508 if(IS_INTRA16x16(mb_type)){
2509 for(i=0; i<16; i++){
2510 if(h->non_zero_count_cache[ scan8[i] ])
2511 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2512 else if(h->mb[i*16])
2513 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2515 }else{
2516 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2517 for(i=0; i<16; i+=di){
2518 int nnz = h->non_zero_count_cache[ scan8[i] ];
2519 if(nnz){
2520 if(nnz==1 && h->mb[i*16])
2521 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2522 else
2523 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2527 }else{
2528 for(i=0; i<16; i++){
2529 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2530 uint8_t * const ptr= dest_y + block_offset[i];
2531 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2537 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2538 uint8_t *dest[2] = {dest_cb, dest_cr};
2539 if(transform_bypass){
2540 idct_add = idct_dc_add = s->dsp.add_pixels4;
2541 }else{
2542 idct_add = s->dsp.h264_idct_add;
2543 idct_dc_add = s->dsp.h264_idct_dc_add;
2544 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2545 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2547 if(is_h264){
2548 for(i=16; i<16+8; i++){
2549 if(h->non_zero_count_cache[ scan8[i] ])
2550 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2551 else if(h->mb[i*16])
2552 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2554 }else{
2555 for(i=16; i<16+8; i++){
2556 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2557 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2558 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2564 if(h->deblocking_filter) {
2565 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2566 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2567 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2568 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2569 if (!simple && FRAME_MBAFF) {
2570 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2571 } else {
2572 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2578 * Process a macroblock; this variant avoids checks for expensive, uncommon cases.
2580 static void hl_decode_mb_simple(H264Context *h){
2581 hl_decode_mb_internal(h, 1);
2585 * Process a macroblock; this handles edge cases, such as interlacing.
2587 static void av_noinline hl_decode_mb_complex(H264Context *h){
2588 hl_decode_mb_internal(h, 0);
2591 static void hl_decode_mb(H264Context *h){
2592 MpegEncContext * const s = &h->s;
2593 const int mb_xy= h->mb_xy;
2594 const int mb_type= s->current_picture.mb_type[mb_xy];
2595 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2596 (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2598 if(ENABLE_H264_ENCODER && !s->decode)
2599 return;
2601 if (is_complex)
2602 hl_decode_mb_complex(h);
2603 else hl_decode_mb_simple(h);
2606 static void pic_as_field(Picture *pic, const int parity){
2607 int i;
2608 for (i = 0; i < 4; ++i) {
2609 if (parity == PICT_BOTTOM_FIELD)
2610 pic->data[i] += pic->linesize[i];
2611 pic->reference = parity;
2612 pic->linesize[i] *= 2;
2614 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2617 static int split_field_copy(Picture *dest, Picture *src,
2618 int parity, int id_add){
2619 int match = !!(src->reference & parity);
2621 if (match) {
2622 *dest = *src;
2623 if(parity != PICT_FRAME){
2624 pic_as_field(dest, parity);
2625 dest->pic_id *= 2;
2626 dest->pic_id += id_add;
2630 return match;
2633 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2634 int i[2]={0};
2635 int index=0;
2637 while(i[0]<len || i[1]<len){
2638 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2639 i[0]++;
2640 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2641 i[1]++;
2642 if(i[0] < len){
2643 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2644 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2646 if(i[1] < len){
2647 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2648 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2652 return index;
2655 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2656 int i, best_poc;
2657 int out_i= 0;
2659 for(;;){
2660 best_poc= dir ? INT_MIN : INT_MAX;
2662 for(i=0; i<len; i++){
2663 const int poc= src[i]->poc;
2664 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2665 best_poc= poc;
2666 sorted[out_i]= src[i];
2669 if(best_poc == (dir ? INT_MIN : INT_MAX))
2670 break;
2671 limit= sorted[out_i++]->poc - dir;
2673 return out_i;
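/* Illustrative example (values made up): for a B slice with cur_poc=6 and
 * short-term references with POCs {0,2,4,8,10}, the add_sorted() calls in
 * fill_default_ref_list() below produce
 *   list0: 4, 2, 0, 8, 10   (past refs by descending POC, then future refs by ascending POC)
 *   list1: 8, 10, 4, 2, 0   (future refs first, then past refs)
 * with any long-term references appended afterwards by build_def_list(). */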
2677 * fills the default_ref_list.
2679 static int fill_default_ref_list(H264Context *h){
2680 MpegEncContext * const s = &h->s;
2681 int i, len;
2683 if(h->slice_type_nos==FF_B_TYPE){
2684 Picture *sorted[32];
2685 int cur_poc, list;
2686 int lens[2];
2688 if(FIELD_PICTURE)
2689 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2690 else
2691 cur_poc= s->current_picture_ptr->poc;
2693 for(list= 0; list<2; list++){
2694 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2695 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2696 assert(len<=32);
2697 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2698 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2699 assert(len<=32);
2701 if(len < h->ref_count[list])
2702 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2703 lens[list]= len;
2706 if(lens[0] == lens[1] && lens[1] > 1){
2707 for(i=0; i<lens[0] && h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++);
2708 if(i == lens[0])
2709 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2711 }else{
2712 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2713 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2714 assert(len <= 32);
2715 if(len < h->ref_count[0])
2716 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2718 #ifdef TRACE
2719 for (i=0; i<h->ref_count[0]; i++) {
2720 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2722 if(h->slice_type_nos==FF_B_TYPE){
2723 for (i=0; i<h->ref_count[1]; i++) {
2724 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2727 #endif
2728 return 0;
2731 static void print_short_term(H264Context *h);
2732 static void print_long_term(H264Context *h);
2735 * Extract structure information about the picture described by pic_num in
2736 * the current decoding context (frame or field). Note that pic_num is
2737 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2738 * @param pic_num picture number for which to extract structure information
2739 * @param structure one of PICT_XXX describing structure of picture
2740 * with pic_num
2741 * @return frame number (short term) or long term index of picture
2742 * described by pic_num
2744 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2745 MpegEncContext * const s = &h->s;
2747 *structure = s->picture_structure;
2748 if(FIELD_PICTURE){
2749 if (!(pic_num & 1))
2750 /* opposite field */
2751 *structure ^= PICT_FRAME;
2752 pic_num >>= 1;
2755 return pic_num;
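/* Illustrative example: when decoding a field picture, an odd pic_num refers
 * to the field with the same parity as the current one and an even pic_num to
 * the opposite parity, so e.g. pic_num 7 maps to frame_num 3 / same parity
 * and pic_num 6 to frame_num 3 / opposite parity. */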
2758 static int decode_ref_pic_list_reordering(H264Context *h){
2759 MpegEncContext * const s = &h->s;
2760 int list, index, pic_structure;
2762 print_short_term(h);
2763 print_long_term(h);
2765 for(list=0; list<h->list_count; list++){
2766 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2768 if(get_bits1(&s->gb)){
2769 int pred= h->curr_pic_num;
2771 for(index=0; ; index++){
2772 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2773 unsigned int pic_id;
2774 int i;
2775 Picture *ref = NULL;
2777 if(reordering_of_pic_nums_idc==3)
2778 break;
2780 if(index >= h->ref_count[list]){
2781 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2782 return -1;
2785 if(reordering_of_pic_nums_idc<3){
2786 if(reordering_of_pic_nums_idc<2){
2787 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2788 int frame_num;
2790 if(abs_diff_pic_num > h->max_pic_num){
2791 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2792 return -1;
2795 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2796 else pred+= abs_diff_pic_num;
2797 pred &= h->max_pic_num - 1;
2799 frame_num = pic_num_extract(h, pred, &pic_structure);
2801 for(i= h->short_ref_count-1; i>=0; i--){
2802 ref = h->short_ref[i];
2803 assert(ref->reference);
2804 assert(!ref->long_ref);
2806 ref->frame_num == frame_num &&
2807 (ref->reference & pic_structure)
2809 break;
2811 if(i>=0)
2812 ref->pic_id= pred;
2813 }else{
2814 int long_idx;
2815 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2817 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2819 if(long_idx>31){
2820 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2821 return -1;
2823 ref = h->long_ref[long_idx];
2824 assert(!(ref && !ref->reference));
2825 if(ref && (ref->reference & pic_structure)){
2826 ref->pic_id= pic_id;
2827 assert(ref->long_ref);
2828 i=0;
2829 }else{
2830 i=-1;
2834 if (i < 0) {
2835 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2836 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2837 } else {
2838 for(i=index; i+1<h->ref_count[list]; i++){
2839 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2840 break;
2842 for(; i > index; i--){
2843 h->ref_list[list][i]= h->ref_list[list][i-1];
2845 h->ref_list[list][index]= *ref;
2846 if (FIELD_PICTURE){
2847 pic_as_field(&h->ref_list[list][index], pic_structure);
2850 }else{
2851 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2852 return -1;
2857 for(list=0; list<h->list_count; list++){
2858 for(index= 0; index < h->ref_count[list]; index++){
2859 if(!h->ref_list[list][index].data[0]){
2860 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2861 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2866 return 0;
2869 static void fill_mbaff_ref_list(H264Context *h){
2870 int list, i, j;
2871 for(list=0; list<2; list++){ //FIXME try list_count
2872 for(i=0; i<h->ref_count[list]; i++){
2873 Picture *frame = &h->ref_list[list][i];
2874 Picture *field = &h->ref_list[list][16+2*i];
2875 field[0] = *frame;
2876 for(j=0; j<3; j++)
2877 field[0].linesize[j] <<= 1;
2878 field[0].reference = PICT_TOP_FIELD;
2879 field[0].poc= field[0].field_poc[0];
2880 field[1] = field[0];
2881 for(j=0; j<3; j++)
2882 field[1].data[j] += frame->linesize[j];
2883 field[1].reference = PICT_BOTTOM_FIELD;
2884 field[1].poc= field[1].field_poc[1];
2886 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2887 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2888 for(j=0; j<2; j++){
2889 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2890 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2894 for(j=0; j<h->ref_count[1]; j++){
2895 for(i=0; i<h->ref_count[0]; i++)
2896 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2897 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2898 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2902 static int pred_weight_table(H264Context *h){
2903 MpegEncContext * const s = &h->s;
2904 int list, i;
2905 int luma_def, chroma_def;
2907 h->use_weight= 0;
2908 h->use_weight_chroma= 0;
2909 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2910 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2911 luma_def = 1<<h->luma_log2_weight_denom;
2912 chroma_def = 1<<h->chroma_log2_weight_denom;
2914 for(list=0; list<2; list++){
2915 for(i=0; i<h->ref_count[list]; i++){
2916 int luma_weight_flag, chroma_weight_flag;
2918 luma_weight_flag= get_bits1(&s->gb);
2919 if(luma_weight_flag){
2920 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2921 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2922 if( h->luma_weight[list][i] != luma_def
2923 || h->luma_offset[list][i] != 0)
2924 h->use_weight= 1;
2925 }else{
2926 h->luma_weight[list][i]= luma_def;
2927 h->luma_offset[list][i]= 0;
2930 if(CHROMA){
2931 chroma_weight_flag= get_bits1(&s->gb);
2932 if(chroma_weight_flag){
2933 int j;
2934 for(j=0; j<2; j++){
2935 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2936 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2937 if( h->chroma_weight[list][i][j] != chroma_def
2938 || h->chroma_offset[list][i][j] != 0)
2939 h->use_weight_chroma= 1;
2941 }else{
2942 int j;
2943 for(j=0; j<2; j++){
2944 h->chroma_weight[list][i][j]= chroma_def;
2945 h->chroma_offset[list][i][j]= 0;
2950 if(h->slice_type_nos != FF_B_TYPE) break;
2952 h->use_weight= h->use_weight || h->use_weight_chroma;
2953 return 0;
2956 static void implicit_weight_table(H264Context *h){
2957 MpegEncContext * const s = &h->s;
2958 int ref0, ref1;
2959 int cur_poc = s->current_picture_ptr->poc;
2961 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
2962 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2963 h->use_weight= 0;
2964 h->use_weight_chroma= 0;
2965 return;
2968 h->use_weight= 2;
2969 h->use_weight_chroma= 2;
2970 h->luma_log2_weight_denom= 5;
2971 h->chroma_log2_weight_denom= 5;
2973 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
2974 int poc0 = h->ref_list[0][ref0].poc;
2975 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
2976 int poc1 = h->ref_list[1][ref1].poc;
2977 int td = av_clip(poc1 - poc0, -128, 127);
2978 if(td){
2979 int tb = av_clip(cur_poc - poc0, -128, 127);
2980 int tx = (16384 + (FFABS(td) >> 1)) / td;
2981 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
2982 if(dist_scale_factor < -64 || dist_scale_factor > 128)
2983 h->implicit_weight[ref0][ref1] = 32;
2984 else
2985 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
2986 }else
2987 h->implicit_weight[ref0][ref1] = 32;
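/* Illustrative sketch (not used by the decoder; the helper name is made up):
 * the implicit bi-prediction weight derived above depends only on the POC
 * distances of the two references from the current picture. For example, with
 * cur_poc=2, poc0=0, poc1=8 the list0 reference is three times closer, so it
 * gets weight 48 and the list1 reference gets 64-48=16 (denominator 64); with
 * the current picture exactly halfway, both weights are 32. */
#if 0
static int example_implicit_weight(int cur_poc, int poc0, int poc1){
    int td = av_clip(poc1 - poc0, -128, 127);
    int tb, tx, dist_scale_factor;
    if(!td)
        return 32;
    tb = av_clip(cur_poc - poc0, -128, 127);
    tx = (16384 + (FFABS(td) >> 1)) / td;
    dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
    if(dist_scale_factor < -64 || dist_scale_factor > 128)
        return 32;
    return 64 - dist_scale_factor; /* 48 for (2,0,8), 32 for (4,0,8) */
}
#endif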
2993 * Mark a picture as no longer needed for reference. The refmask
2994 * argument allows unreferencing of individual fields or the whole frame.
2995 * If the picture becomes entirely unreferenced, but is being held for
2996 * display purposes, it is marked as such.
2997 * @param refmask mask of fields to unreference; the mask is bitwise
2998 * ANDed with the reference marking of pic
2999 * @return non-zero if pic becomes entirely unreferenced (except possibly
3000 * for display purposes), zero if one of the fields remains in
3001 * reference
3003 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3004 int i;
3005 if (pic->reference &= refmask) {
3006 return 0;
3007 } else {
3008 for(i = 0; h->delayed_pic[i]; i++)
3009 if(pic == h->delayed_pic[i]){
3010 pic->reference=DELAYED_PIC_REF;
3011 break;
3013 return 1;
3018 * instantaneous decoder refresh.
3020 static void idr(H264Context *h){
3021 int i;
3023 for(i=0; i<16; i++){
3024 remove_long(h, i, 0);
3026 assert(h->long_ref_count==0);
3028 for(i=0; i<h->short_ref_count; i++){
3029 unreference_pic(h, h->short_ref[i], 0);
3030 h->short_ref[i]= NULL;
3032 h->short_ref_count=0;
3033 h->prev_frame_num= 0;
3034 h->prev_frame_num_offset= 0;
3035 h->prev_poc_msb=
3036 h->prev_poc_lsb= 0;
3039 /* forget old pics after a seek */
3040 static void flush_dpb(AVCodecContext *avctx){
3041 H264Context *h= avctx->priv_data;
3042 int i;
3043 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3044 if(h->delayed_pic[i])
3045 h->delayed_pic[i]->reference= 0;
3046 h->delayed_pic[i]= NULL;
3048 h->outputed_poc= INT_MIN;
3049 idr(h);
3050 if(h->s.current_picture_ptr)
3051 h->s.current_picture_ptr->reference= 0;
3052 h->s.first_field= 0;
3053 ff_mpeg_flush(avctx);
3057 * Find a Picture in the short term reference list by frame number.
3058 * @param frame_num frame number to search for
3059 * @param idx the index into h->short_ref where the returned picture is found;
3060 * undefined if no picture is found.
3061 * @return pointer to the found picture, or NULL if no pic with the provided
3062 * frame number is found
3064 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3065 MpegEncContext * const s = &h->s;
3066 int i;
3068 for(i=0; i<h->short_ref_count; i++){
3069 Picture *pic= h->short_ref[i];
3070 if(s->avctx->debug&FF_DEBUG_MMCO)
3071 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3072 if(pic->frame_num == frame_num) {
3073 *idx = i;
3074 return pic;
3077 return NULL;
3081 * Remove a picture from the short term reference list by its index in
3082 * that list. This does no checking on the provided index; it is assumed
3083 * to be valid. Other list entries are shifted down.
3084 * @param i index into h->short_ref of picture to remove.
3086 static void remove_short_at_index(H264Context *h, int i){
3087 assert(i >= 0 && i < h->short_ref_count);
3088 h->short_ref[i]= NULL;
3089 if (--h->short_ref_count)
3090 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3095 * @return the removed picture or NULL if an error occurs
3097 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3098 MpegEncContext * const s = &h->s;
3099 Picture *pic;
3100 int i;
3102 if(s->avctx->debug&FF_DEBUG_MMCO)
3103 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3105 pic = find_short(h, frame_num, &i);
3106 if (pic){
3107 if(unreference_pic(h, pic, ref_mask))
3108 remove_short_at_index(h, i);
3111 return pic;
3115 * Remove a picture from the long term reference list by its index in
3116 * that list.
3117 * @return the removed picture or NULL if an error occurs
3119 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3120 Picture *pic;
3122 pic= h->long_ref[i];
3123 if (pic){
3124 if(unreference_pic(h, pic, ref_mask)){
3125 assert(h->long_ref[i]->long_ref == 1);
3126 h->long_ref[i]->long_ref= 0;
3127 h->long_ref[i]= NULL;
3128 h->long_ref_count--;
3132 return pic;
3136 * print short term list
3138 static void print_short_term(H264Context *h) {
3139 uint32_t i;
3140 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3141 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3142 for(i=0; i<h->short_ref_count; i++){
3143 Picture *pic= h->short_ref[i];
3144 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3150 * print long term list
3152 static void print_long_term(H264Context *h) {
3153 uint32_t i;
3154 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3155 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3156 for(i = 0; i < 16; i++){
3157 Picture *pic= h->long_ref[i];
3158 if (pic) {
3159 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3166 * Executes the reference picture marking (memory management control operations).
3168 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3169 MpegEncContext * const s = &h->s;
3170 int i, j;
3171 int current_ref_assigned=0;
3172 Picture *pic;
3174 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3175 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3177 for(i=0; i<mmco_count; i++){
3178 int structure, frame_num;
3179 if(s->avctx->debug&FF_DEBUG_MMCO)
3180 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3182 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3183 || mmco[i].opcode == MMCO_SHORT2LONG){
3184 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3185 pic = find_short(h, frame_num, &j);
3186 if(!pic){
3187 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3188 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3189 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3190 continue;
3194 switch(mmco[i].opcode){
3195 case MMCO_SHORT2UNUSED:
3196 if(s->avctx->debug&FF_DEBUG_MMCO)
3197 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3198 remove_short(h, frame_num, structure ^ PICT_FRAME);
3199 break;
3200 case MMCO_SHORT2LONG:
3201 if (h->long_ref[mmco[i].long_arg] != pic)
3202 remove_long(h, mmco[i].long_arg, 0);
3204 remove_short_at_index(h, j);
3205 h->long_ref[ mmco[i].long_arg ]= pic;
3206 if (h->long_ref[ mmco[i].long_arg ]){
3207 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3208 h->long_ref_count++;
3210 break;
3211 case MMCO_LONG2UNUSED:
3212 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3213 pic = h->long_ref[j];
3214 if (pic) {
3215 remove_long(h, j, structure ^ PICT_FRAME);
3216 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3217 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3218 break;
3219 case MMCO_LONG:
3220 // Comment below kept from previous code as it is an interesting note.
3221 /* First field in pair is in short term list or
3222 * at a different long term index.
3223 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3224 * Report the problem and keep the pair where it is,
3225 * and mark this field valid.
3228 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3229 remove_long(h, mmco[i].long_arg, 0);
3231 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3232 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3233 h->long_ref_count++;
3236 s->current_picture_ptr->reference |= s->picture_structure;
3237 current_ref_assigned=1;
3238 break;
3239 case MMCO_SET_MAX_LONG:
3240 assert(mmco[i].long_arg <= 16);
3241 // just remove the long term refs whose index is greater than the new max
3242 for(j = mmco[i].long_arg; j<16; j++){
3243 remove_long(h, j, 0);
3245 break;
3246 case MMCO_RESET:
3247 while(h->short_ref_count){
3248 remove_short(h, h->short_ref[0]->frame_num, 0);
3250 for(j = 0; j < 16; j++) {
3251 remove_long(h, j, 0);
3253 s->current_picture_ptr->poc=
3254 s->current_picture_ptr->field_poc[0]=
3255 s->current_picture_ptr->field_poc[1]=
3256 h->poc_lsb=
3257 h->poc_msb=
3258 h->frame_num=
3259 s->current_picture_ptr->frame_num= 0;
3260 break;
3261 default: assert(0);
3265 if (!current_ref_assigned) {
3266 /* This is the second field of a complementary field pair, the first
3267 * field of which is already referenced. If short referenced, it
3268 * should be the first entry in short_ref. If not, it must exist
3269 * in long_ref; trying to put it on the short list here is an
3270 * error in the encoded bitstream (ref: 7.4.3.3, NOTE 2 and 3).
3272 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3273 /* Just mark the second field valid */
3274 s->current_picture_ptr->reference = PICT_FRAME;
3275 } else if (s->current_picture_ptr->long_ref) {
3276 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3277 "assignment for second field "
3278 "in complementary field pair "
3279 "(first field is long term)\n");
3280 } else {
3281 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3282 if(pic){
3283 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3286 if(h->short_ref_count)
3287 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3289 h->short_ref[0]= s->current_picture_ptr;
3290 h->short_ref_count++;
3291 s->current_picture_ptr->reference |= s->picture_structure;
3295 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3297 /* We have too many reference frames, probably due to corrupted
3298 * stream. Need to discard one frame. Prevents overrun of the
3299 * short_ref and long_ref buffers.
3301 av_log(h->s.avctx, AV_LOG_ERROR,
3302 "number of reference frames exceeds max (probably "
3303 "corrupt input), discarding one\n");
3305 if (h->long_ref_count && !h->short_ref_count) {
3306 for (i = 0; i < 16; ++i)
3307 if (h->long_ref[i])
3308 break;
3310 assert(i < 16);
3311 remove_long(h, i, 0);
3312 } else {
3313 pic = h->short_ref[h->short_ref_count - 1];
3314 remove_short(h, pic->frame_num, 0);
3318 print_short_term(h);
3319 print_long_term(h);
3320 return 0;
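/**
 * Parses the dec_ref_pic_marking() part of the slice header and fills
 * h->mmco[] / h->mmco_index; the operations themselves are applied later
 * by execute_ref_pic_marking().
 */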
3323 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3324 MpegEncContext * const s = &h->s;
3325 int i;
3327 h->mmco_index= 0;
3328 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3329 s->broken_link= get_bits1(gb) -1;
3330 if(get_bits1(gb)){
3331 h->mmco[0].opcode= MMCO_LONG;
3332 h->mmco[0].long_arg= 0;
3333 h->mmco_index= 1;
3335 }else{
3336 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3337 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3338 MMCOOpcode opcode= get_ue_golomb(gb);
3340 h->mmco[i].opcode= opcode;
3341 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3342 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3343 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3344 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3345 return -1;
3348 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3349 unsigned int long_arg= get_ue_golomb(gb);
3350 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3351 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3352 return -1;
3354 h->mmco[i].long_arg= long_arg;
3357 if(opcode > (unsigned)MMCO_LONG){
3358 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3359 return -1;
3361 if(opcode == MMCO_END)
3362 break;
3364 h->mmco_index= i;
3365 }else{
3366 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3368 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3369 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3370 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3371 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3372 h->mmco_index= 1;
3373 if (FIELD_PICTURE) {
3374 h->mmco[0].short_pic_num *= 2;
3375 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3376 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3377 h->mmco_index= 2;
3383 return 0;
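/**
 * Computes the picture order count (POC) of the current picture.
 * poc_type 0 reconstructs the MSB from pic_order_cnt_lsb, poc_type 1 uses
 * the expected delta per POC cycle signalled in the SPS, and poc_type 2
 * derives the POC directly from frame_num (see H.264 subclause 8.2.1).
 */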
3386 static int init_poc(H264Context *h){
3387 MpegEncContext * const s = &h->s;
3388 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3389 int field_poc[2];
3390 Picture *cur = s->current_picture_ptr;
3392 h->frame_num_offset= h->prev_frame_num_offset;
3393 if(h->frame_num < h->prev_frame_num)
3394 h->frame_num_offset += max_frame_num;
3396 if(h->sps.poc_type==0){
3397 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
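/* Detect wrap-around of pic_order_cnt_lsb relative to the previous reference
 * picture and step poc_msb up or down by max_poc_lsb accordingly (8.2.1.1). */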
3399 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3400 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3401 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3402 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3403 else
3404 h->poc_msb = h->prev_poc_msb;
3405 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3406 field_poc[0] =
3407 field_poc[1] = h->poc_msb + h->poc_lsb;
3408 if(s->picture_structure == PICT_FRAME)
3409 field_poc[1] += h->delta_poc_bottom;
3410 }else if(h->sps.poc_type==1){
3411 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3412 int i;
3414 if(h->sps.poc_cycle_length != 0)
3415 abs_frame_num = h->frame_num_offset + h->frame_num;
3416 else
3417 abs_frame_num = 0;
3419 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3420 abs_frame_num--;
3422 expected_delta_per_poc_cycle = 0;
3423 for(i=0; i < h->sps.poc_cycle_length; i++)
3424 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3426 if(abs_frame_num > 0){
3427 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3428 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3430 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3431 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3432 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3433 } else
3434 expectedpoc = 0;
3436 if(h->nal_ref_idc == 0)
3437 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3439 field_poc[0] = expectedpoc + h->delta_poc[0];
3440 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3442 if(s->picture_structure == PICT_FRAME)
3443 field_poc[1] += h->delta_poc[1];
3444 }else{
3445 int poc= 2*(h->frame_num_offset + h->frame_num);
3447 if(!h->nal_ref_idc)
3448 poc--;
3450 field_poc[0]= poc;
3451 field_poc[1]= poc;
3454 if(s->picture_structure != PICT_BOTTOM_FIELD)
3455 s->current_picture_ptr->field_poc[0]= field_poc[0];
3456 if(s->picture_structure != PICT_TOP_FIELD)
3457 s->current_picture_ptr->field_poc[1]= field_poc[1];
3458 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3460 return 0;
3465 * initialize scan tables
3467 static void init_scan_tables(H264Context *h){
3468 MpegEncContext * const s = &h->s;
3469 int i;
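/* If the DSP context uses a non-C IDCT, the scan tables are permuted so that
 * decoded coefficients land in the order that IDCT expects; presumably this
 * matches the transposed block layout of the SIMD implementations. */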
3470 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3471 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3472 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3473 }else{
3474 for(i=0; i<16; i++){
3475 #define T(x) (x>>2) | ((x<<2) & 0xF)
3476 h->zigzag_scan[i] = T(zigzag_scan[i]);
3477 h-> field_scan[i] = T( field_scan[i]);
3478 #undef T
3481 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3482 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3483 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3484 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3485 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3486 }else{
3487 for(i=0; i<64; i++){
3488 #define T(x) (x>>3) | ((x&7)<<3)
3489 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3490 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3491 h->field_scan8x8[i] = T(field_scan8x8[i]);
3492 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3493 #undef T
3496 if(h->sps.transform_bypass){ //FIXME same ugly
3497 h->zigzag_scan_q0 = zigzag_scan;
3498 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3499 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3500 h->field_scan_q0 = field_scan;
3501 h->field_scan8x8_q0 = field_scan8x8;
3502 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3503 }else{
3504 h->zigzag_scan_q0 = h->zigzag_scan;
3505 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3506 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3507 h->field_scan_q0 = h->field_scan;
3508 h->field_scan8x8_q0 = h->field_scan8x8;
3509 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3514 * Replicates H264 "master" context to thread contexts.
3516 static void clone_slice(H264Context *dst, H264Context *src)
3518 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3519 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3520 dst->s.current_picture = src->s.current_picture;
3521 dst->s.linesize = src->s.linesize;
3522 dst->s.uvlinesize = src->s.uvlinesize;
3523 dst->s.first_field = src->s.first_field;
3525 dst->prev_poc_msb = src->prev_poc_msb;
3526 dst->prev_poc_lsb = src->prev_poc_lsb;
3527 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3528 dst->prev_frame_num = src->prev_frame_num;
3529 dst->short_ref_count = src->short_ref_count;
3531 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3532 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3533 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3534 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3536 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3537 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3541 * decodes a slice header.
3542 * This will also call MPV_common_init() and frame_start() as needed.
3544 * @param h H264Context of the slice being decoded
3545 * @param h0 H264 master context (differs from 'h' when doing slice-based parallel decoding)
3547 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3549 static int decode_slice_header(H264Context *h, H264Context *h0){
3550 MpegEncContext * const s = &h->s;
3551 MpegEncContext * const s0 = &h0->s;
3552 unsigned int first_mb_in_slice;
3553 unsigned int pps_id;
3554 int num_ref_idx_active_override_flag;
3555 unsigned int slice_type, tmp, i, j;
3556 int default_ref_list_done = 0;
3557 int last_pic_structure;
3559 s->dropable= h->nal_ref_idc == 0;
3561 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3562 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3563 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3564 }else{
3565 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3566 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3569 first_mb_in_slice= get_ue_golomb(&s->gb);
3571 if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3572 h0->current_slice = 0;
3573 if (!s0->first_field)
3574 s->current_picture_ptr= NULL;
3577 slice_type= get_ue_golomb(&s->gb);
3578 if(slice_type > 9){
3579 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3580 return -1;
3582 if(slice_type > 4){
3583 slice_type -= 5;
3584 h->slice_type_fixed=1;
3585 }else
3586 h->slice_type_fixed=0;
3588 slice_type= golomb_to_pict_type[ slice_type ];
3589 if (slice_type == FF_I_TYPE
3590 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3591 default_ref_list_done = 1;
3593 h->slice_type= slice_type;
3594 h->slice_type_nos= slice_type & 3;
3596 s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3597 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3598 av_log(h->s.avctx, AV_LOG_ERROR,
3599 "B picture before any references, skipping\n");
3600 return -1;
3603 pps_id= get_ue_golomb(&s->gb);
3604 if(pps_id>=MAX_PPS_COUNT){
3605 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3606 return -1;
3608 if(!h0->pps_buffers[pps_id]) {
3609 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3610 return -1;
3612 h->pps= *h0->pps_buffers[pps_id];
3614 if(!h0->sps_buffers[h->pps.sps_id]) {
3615 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3616 return -1;
3618 h->sps = *h0->sps_buffers[h->pps.sps_id];
3620 if(h == h0 && h->dequant_coeff_pps != pps_id){
3621 h->dequant_coeff_pps = pps_id;
3622 init_dequant_tables(h);
3625 s->mb_width= h->sps.mb_width;
3626 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3628 h->b_stride= s->mb_width*4;
3629 h->b8_stride= s->mb_width*2;
3631 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3632 if(h->sps.frame_mbs_only_flag)
3633 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3634 else
3635 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3637 if (s->context_initialized
3638 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3639 if(h != h0)
3640 return -1; // width / height changed during parallelized decoding
3641 free_tables(h);
3642 flush_dpb(s->avctx);
3643 MPV_common_end(s);
3645 if (!s->context_initialized) {
3646 if(h != h0)
3647 return -1; // we can't (re-)initialize context during parallel decoding
3648 if (MPV_common_init(s) < 0)
3649 return -1;
3650 s->first_field = 0;
3652 init_scan_tables(h);
3653 alloc_tables(h);
3655 for(i = 1; i < s->avctx->thread_count; i++) {
3656 H264Context *c;
3657 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3658 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3659 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3660 c->sps = h->sps;
3661 c->pps = h->pps;
3662 init_scan_tables(c);
3663 clone_tables(c, h);
3666 for(i = 0; i < s->avctx->thread_count; i++)
3667 if(context_init(h->thread_context[i]) < 0)
3668 return -1;
3670 s->avctx->width = s->width;
3671 s->avctx->height = s->height;
3672 s->avctx->sample_aspect_ratio= h->sps.sar;
3673 if(!s->avctx->sample_aspect_ratio.den)
3674 s->avctx->sample_aspect_ratio.den = 1;
3676 if(h->sps.timing_info_present_flag){
3677 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3678 if(h->x264_build > 0 && h->x264_build < 44)
3679 s->avctx->time_base.den *= 2;
3680 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3681 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3685 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3687 h->mb_mbaff = 0;
3688 h->mb_aff_frame = 0;
3689 last_pic_structure = s0->picture_structure;
3690 if(h->sps.frame_mbs_only_flag){
3691 s->picture_structure= PICT_FRAME;
3692 }else{
3693 if(get_bits1(&s->gb)) { //field_pic_flag
3694 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3695 } else {
3696 s->picture_structure= PICT_FRAME;
3697 h->mb_aff_frame = h->sps.mb_aff;
3700 h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3702 if(h0->current_slice == 0){
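/* A gap in frame_num means reference frames were skipped or lost; fake the
 * missing frames (frame_start() plus reference marking) so the short-term
 * reference list stays consistent with the bitstream's frame numbering. */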
3703 while(h->frame_num != h->prev_frame_num &&
3704 h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3705 av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3706 frame_start(h);
3707 h->prev_frame_num++;
3708 h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3709 s->current_picture_ptr->frame_num= h->prev_frame_num;
3710 execute_ref_pic_marking(h, NULL, 0);
3713 /* See if we have a decoded first field looking for a pair... */
3714 if (s0->first_field) {
3715 assert(s0->current_picture_ptr);
3716 assert(s0->current_picture_ptr->data[0]);
3717 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3719 /* figure out if we have a complementary field pair */
3720 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3722 * Previous field is unmatched. Don't display it, but let it
3723 * remain for reference if marked as such.
3725 s0->current_picture_ptr = NULL;
3726 s0->first_field = FIELD_PICTURE;
3728 } else {
3729 if (h->nal_ref_idc &&
3730 s0->current_picture_ptr->reference &&
3731 s0->current_picture_ptr->frame_num != h->frame_num) {
3733 * This and previous field were reference, but had
3734 * different frame_nums. Consider this field first in
3735 * pair. Throw away previous field except for reference
3736 * purposes.
3738 s0->first_field = 1;
3739 s0->current_picture_ptr = NULL;
3741 } else {
3742 /* Second field in complementary pair */
3743 s0->first_field = 0;
3747 } else {
3748 /* Frame or first field in a potentially complementary pair */
3749 assert(!s0->current_picture_ptr);
3750 s0->first_field = FIELD_PICTURE;
3753 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3754 s0->first_field = 0;
3755 return -1;
3758 if(h != h0)
3759 clone_slice(h, h0);
3761 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3763 assert(s->mb_num == s->mb_width * s->mb_height);
3764 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3765 first_mb_in_slice >= s->mb_num){
3766 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3767 return -1;
3769 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3770 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3771 if (s->picture_structure == PICT_BOTTOM_FIELD)
3772 s->resync_mb_y = s->mb_y = s->mb_y + 1;
3773 assert(s->mb_y < s->mb_height);
3775 if(s->picture_structure==PICT_FRAME){
3776 h->curr_pic_num= h->frame_num;
3777 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3778 }else{
3779 h->curr_pic_num= 2*h->frame_num + 1;
3780 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3783 if(h->nal_unit_type == NAL_IDR_SLICE){
3784 get_ue_golomb(&s->gb); /* idr_pic_id */
3787 if(h->sps.poc_type==0){
3788 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3790 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3791 h->delta_poc_bottom= get_se_golomb(&s->gb);
3795 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3796 h->delta_poc[0]= get_se_golomb(&s->gb);
3798 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3799 h->delta_poc[1]= get_se_golomb(&s->gb);
3802 init_poc(h);
3804 if(h->pps.redundant_pic_cnt_present){
3805 h->redundant_pic_count= get_ue_golomb(&s->gb);
3808 //set defaults, might be overridden a few lines later
3809 h->ref_count[0]= h->pps.ref_count[0];
3810 h->ref_count[1]= h->pps.ref_count[1];
3812 if(h->slice_type_nos != FF_I_TYPE){
3813 if(h->slice_type_nos == FF_B_TYPE){
3814 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3816 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3818 if(num_ref_idx_active_override_flag){
3819 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3820 if(h->slice_type_nos==FF_B_TYPE)
3821 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3823 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3824 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3825 h->ref_count[0]= h->ref_count[1]= 1;
3826 return -1;
3829 if(h->slice_type_nos == FF_B_TYPE)
3830 h->list_count= 2;
3831 else
3832 h->list_count= 1;
3833 }else
3834 h->list_count= 0;
3836 if(!default_ref_list_done){
3837 fill_default_ref_list(h);
3840 if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3841 return -1;
3843 if(h->slice_type_nos!=FF_I_TYPE){
3844 s->last_picture_ptr= &h->ref_list[0][0];
3845 ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3847 if(h->slice_type_nos==FF_B_TYPE){
3848 s->next_picture_ptr= &h->ref_list[1][0];
3849 ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3852 if( (h->pps.weighted_pred && h->slice_type_nos == FF_P_TYPE )
3853 || (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3854 pred_weight_table(h);
3855 else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3856 implicit_weight_table(h);
3857 else
3858 h->use_weight = 0;
3860 if(h->nal_ref_idc)
3861 decode_ref_pic_marking(h0, &s->gb);
3863 if(FRAME_MBAFF)
3864 fill_mbaff_ref_list(h);
3866 if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3867 direct_dist_scale_factor(h);
3868 direct_ref_list_init(h);
3870 if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3871 tmp = get_ue_golomb(&s->gb);
3872 if(tmp > 2){
3873 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3874 return -1;
3876 h->cabac_init_idc= tmp;
3879 h->last_qscale_diff = 0;
3880 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3881 if(tmp>51){
3882 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3883 return -1;
3885 s->qscale= tmp;
3886 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3887 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3888 //FIXME qscale / qp ... stuff
3889 if(h->slice_type == FF_SP_TYPE){
3890 get_bits1(&s->gb); /* sp_for_switch_flag */
3892 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3893 get_se_golomb(&s->gb); /* slice_qs_delta */
3896 h->deblocking_filter = 1;
3897 h->slice_alpha_c0_offset = 0;
3898 h->slice_beta_offset = 0;
3899 if( h->pps.deblocking_filter_parameters_present ) {
3900 tmp= get_ue_golomb(&s->gb);
3901 if(tmp > 2){
3902 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3903 return -1;
3905 h->deblocking_filter= tmp;
3906 if(h->deblocking_filter < 2)
3907 h->deblocking_filter^= 1; // 1<->0
3909 if( h->deblocking_filter ) {
3910 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3911 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3915 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
3916 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3917 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type_nos == FF_B_TYPE)
3918 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3919 h->deblocking_filter= 0;
3921 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3922 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3923 /* Cheat slightly for speed:
3924 Do not bother to deblock across slices. */
3925 h->deblocking_filter = 2;
3926 } else {
3927 h0->max_contexts = 1;
3928 if(!h0->single_decode_warning) {
3929 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3930 h0->single_decode_warning = 1;
3932 if(h != h0)
3933 return 1; // deblocking switched inside frame
3937 #if 0 //FMO
3938 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3939 slice_group_change_cycle= get_bits(&s->gb, ?);
3940 #endif
3942 h0->last_slice_type = slice_type;
3943 h->slice_num = ++h0->current_slice;
3944 if(h->slice_num >= MAX_SLICES){
3945 av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3948 for(j=0; j<2; j++){
3949 int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3950 ref2frm[0]=
3951 ref2frm[1]= -1;
3952 for(i=0; i<16; i++)
3953 ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3954 +(h->ref_list[j][i].reference&3);
3955 ref2frm[18+0]=
3956 ref2frm[18+1]= -1;
3957 for(i=16; i<48; i++)
3958 ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3959 +(h->ref_list[j][i].reference&3);
3962 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3963 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3965 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3966 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3967 h->slice_num,
3968 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3969 first_mb_in_slice,
3970 av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
3971 pps_id, h->frame_num,
3972 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3973 h->ref_count[0], h->ref_count[1],
3974 s->qscale,
3975 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
3976 h->use_weight,
3977 h->use_weight==1 && h->use_weight_chroma ? "c" : "",
3978 h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
3982 return 0;
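/**
 * Reads the CAVLC level_prefix: the run of zero bits before the terminating
 * '1' bit; the '1' is consumed as well and the number of zeros is returned.
 */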
3988 static inline int get_level_prefix(GetBitContext *gb){
3989 unsigned int buf;
3990 int log;
3992 OPEN_READER(re, gb);
3993 UPDATE_CACHE(re, gb);
3994 buf=GET_CACHE(re, gb);
3996 log= 32 - av_log2(buf);
3997 #ifdef TRACE
3998 print_bin(buf>>(32-log), log);
3999 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4000 #endif
4002 LAST_SKIP_BITS(re, gb, log);
4003 CLOSE_READER(re, gb);
4005 return log-1;
4008 static inline int get_dct8x8_allowed(H264Context *h){
4009 int i;
4010 for(i=0; i<4; i++){
4011 if(!IS_SUB_8X8(h->sub_mb_type[i])
4012 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4013 return 0;
4015 return 1;
4019 * decodes a residual block.
4020 * @param n block index
4021 * @param scantable scantable
4022 * @param max_coeff number of coefficients in the block
4023 * @return <0 if an error occurred
4025 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4026 MpegEncContext * const s = &h->s;
4027 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4028 int level[16];
4029 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4031 //FIXME put trailing_ones into the context
4033 if(n == CHROMA_DC_BLOCK_INDEX){
4034 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4035 total_coeff= coeff_token>>2;
4036 }else{
4037 if(n == LUMA_DC_BLOCK_INDEX){
4038 total_coeff= pred_non_zero_count(h, 0);
4039 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4040 total_coeff= coeff_token>>2;
4041 }else{
4042 total_coeff= pred_non_zero_count(h, n);
4043 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4044 total_coeff= coeff_token>>2;
4045 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4049 //FIXME set last_non_zero?
4051 if(total_coeff==0)
4052 return 0;
4053 if(total_coeff > (unsigned)max_coeff) {
4054 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4055 return -1;
4058 trailing_ones= coeff_token&3;
4059 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4060 assert(total_coeff<=16);
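/* The first trailing_ones coefficients are +/-1 and carry only a sign bit;
 * the remaining levels are coded as level_prefix/suffix pairs whose suffix
 * length adapts to the magnitudes decoded so far. */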
4062 for(i=0; i<trailing_ones; i++){
4063 level[i]= 1 - 2*get_bits1(gb);
4066 if(i<total_coeff) {
4067 int level_code, mask;
4068 int suffix_length = total_coeff > 10 && trailing_ones < 3;
4069 int prefix= get_level_prefix(gb);
4071 //first coefficient has suffix_length equal to 0 or 1
4072 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4073 if(suffix_length)
4074 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4075 else
4076 level_code= (prefix<<suffix_length); //part
4077 }else if(prefix==14){
4078 if(suffix_length)
4079 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4080 else
4081 level_code= prefix + get_bits(gb, 4); //part
4082 }else{
4083 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4084 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much) sense
4085 if(prefix>=16)
4086 level_code += (1<<(prefix-3))-4096;
4089 if(trailing_ones < 3) level_code += 2;
4091 suffix_length = 1;
4092 if(level_code > 5)
4093 suffix_length++;
4094 mask= -(level_code&1);
4095 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4096 i++;
4098 //remaining coefficients have suffix_length > 0
4099 for(;i<total_coeff;i++) {
4100 static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4101 prefix = get_level_prefix(gb);
4102 if(prefix<15){
4103 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4104 }else{
4105 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4106 if(prefix>=16)
4107 level_code += (1<<(prefix-3))-4096;
4109 mask= -(level_code&1);
4110 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4111 if(level_code > suffix_limit[suffix_length])
4112 suffix_length++;
4116 if(total_coeff == max_coeff)
4117 zeros_left=0;
4118 else{
4119 if(n == CHROMA_DC_BLOCK_INDEX)
4120 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4121 else
4122 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4125 coeff_num = zeros_left + total_coeff - 1;
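/* Coefficients are placed starting at the highest occupied scan position and
 * moving backwards; run_before gives the number of zeros skipped before each
 * earlier coefficient. For the DC blocks (n > 24) no dequantization is done
 * here, it happens after the separate DC transform. */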
4126 j = scantable[coeff_num];
4127 if(n > 24){
4128 block[j] = level[0];
4129 for(i=1;i<total_coeff;i++) {
4130 if(zeros_left <= 0)
4131 run_before = 0;
4132 else if(zeros_left < 7){
4133 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4134 }else{
4135 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4137 zeros_left -= run_before;
4138 coeff_num -= 1 + run_before;
4139 j= scantable[ coeff_num ];
4141 block[j]= level[i];
4143 }else{
4144 block[j] = (level[0] * qmul[j] + 32)>>6;
4145 for(i=1;i<total_coeff;i++) {
4146 if(zeros_left <= 0)
4147 run_before = 0;
4148 else if(zeros_left < 7){
4149 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4150 }else{
4151 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4153 zeros_left -= run_before;
4154 coeff_num -= 1 + run_before;
4155 j= scantable[ coeff_num ];
4157 block[j]= (level[i] * qmul[j] + 32)>>6;
4161 if(zeros_left<0){
4162 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4163 return -1;
4166 return 0;
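/**
 * Predicts mb_field_decoding_flag of a skipped MBAFF macroblock pair from
 * the left or top neighbour belonging to the same slice.
 */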
4169 static void predict_field_decoding_flag(H264Context *h){
4170 MpegEncContext * const s = &h->s;
4171 const int mb_xy= h->mb_xy;
4172 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4173 ? s->current_picture.mb_type[mb_xy-1]
4174 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4175 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4176 : 0;
4177 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4181 * decodes a P_SKIP or B_SKIP macroblock
4183 static void decode_mb_skip(H264Context *h){
4184 MpegEncContext * const s = &h->s;
4185 const int mb_xy= h->mb_xy;
4186 int mb_type=0;
4188 memset(h->non_zero_count[mb_xy], 0, 16);
4189 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove (yuck)
4191 if(MB_FIELD)
4192 mb_type|= MB_TYPE_INTERLACED;
4194 if( h->slice_type_nos == FF_B_TYPE )
4196 // just for fill_caches. pred_direct_motion will set the real mb_type
4197 mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4199 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4200 pred_direct_motion(h, &mb_type);
4201 mb_type|= MB_TYPE_SKIP;
4203 else
4205 int mx, my;
4206 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4208 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4209 pred_pskip_motion(h, &mx, &my);
4210 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4211 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4214 write_back_motion(h, mb_type);
4215 s->current_picture.mb_type[mb_xy]= mb_type;
4216 s->current_picture.qscale_table[mb_xy]= s->qscale;
4217 h->slice_table[ mb_xy ]= h->slice_num;
4218 h->prev_mb_skipped= 1;
4222 * decodes a macroblock
4223 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4225 static int decode_mb_cavlc(H264Context *h){
4226 MpegEncContext * const s = &h->s;
4227 int mb_xy;
4228 int partition_count;
4229 unsigned int mb_type, cbp;
4230 int dct8x8_allowed= h->pps.transform_8x8_mode;
4232 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4234 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4236 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4237 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4238 down the code */
4239 if(h->slice_type_nos != FF_I_TYPE){
4240 if(s->mb_skip_run==-1)
4241 s->mb_skip_run= get_ue_golomb(&s->gb);
4243 if (s->mb_skip_run--) {
4244 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4245 if(s->mb_skip_run==0)
4246 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4247 else
4248 predict_field_decoding_flag(h);
4250 decode_mb_skip(h);
4251 return 0;
4254 if(FRAME_MBAFF){
4255 if( (s->mb_y&1) == 0 )
4256 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4259 h->prev_mb_skipped= 0;
4261 mb_type= get_ue_golomb(&s->gb);
4262 if(h->slice_type_nos == FF_B_TYPE){
4263 if(mb_type < 23){
4264 partition_count= b_mb_type_info[mb_type].partition_count;
4265 mb_type= b_mb_type_info[mb_type].type;
4266 }else{
4267 mb_type -= 23;
4268 goto decode_intra_mb;
4270 }else if(h->slice_type_nos == FF_P_TYPE){
4271 if(mb_type < 5){
4272 partition_count= p_mb_type_info[mb_type].partition_count;
4273 mb_type= p_mb_type_info[mb_type].type;
4274 }else{
4275 mb_type -= 5;
4276 goto decode_intra_mb;
4278 }else{
4279 assert(h->slice_type_nos == FF_I_TYPE);
4280 if(h->slice_type == FF_SI_TYPE && mb_type)
4281 mb_type--;
4282 decode_intra_mb:
4283 if(mb_type > 25){
4284 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4285 return -1;
4287 partition_count=0;
4288 cbp= i_mb_type_info[mb_type].cbp;
4289 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4290 mb_type= i_mb_type_info[mb_type].type;
4293 if(MB_FIELD)
4294 mb_type |= MB_TYPE_INTERLACED;
4296 h->slice_table[ mb_xy ]= h->slice_num;
4298 if(IS_INTRA_PCM(mb_type)){
4299 unsigned int x;
4301 // We assume these blocks are very rare so we do not optimize them.
4302 align_get_bits(&s->gb);
4304 // The pixels are stored in the same order as levels in h->mb array.
4305 for(x=0; x < (CHROMA ? 384 : 256); x++){
4306 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4309 // In deblocking, the quantizer is 0
4310 s->current_picture.qscale_table[mb_xy]= 0;
4311 // All coeffs are present
4312 memset(h->non_zero_count[mb_xy], 16, 16);
4314 s->current_picture.mb_type[mb_xy]= mb_type;
4315 return 0;
4318 if(MB_MBAFF){
4319 h->ref_count[0] <<= 1;
4320 h->ref_count[1] <<= 1;
4323 fill_caches(h, mb_type, 0);
4325 //mb_pred
4326 if(IS_INTRA(mb_type)){
4327 int pred_mode;
4328 // init_top_left_availability(h);
4329 if(IS_INTRA4x4(mb_type)){
4330 int i;
4331 int di = 1;
4332 if(dct8x8_allowed && get_bits1(&s->gb)){
4333 mb_type |= MB_TYPE_8x8DCT;
4334 di = 4;
4337 // fill_intra4x4_pred_table(h);
4338 for(i=0; i<16; i+=di){
4339 int mode= pred_intra_mode(h, i);
4341 if(!get_bits1(&s->gb)){
4342 const int rem_mode= get_bits(&s->gb, 3);
4343 mode = rem_mode + (rem_mode >= mode);
4346 if(di==4)
4347 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4348 else
4349 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4351 write_back_intra_pred_mode(h);
4352 if( check_intra4x4_pred_mode(h) < 0)
4353 return -1;
4354 }else{
4355 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4356 if(h->intra16x16_pred_mode < 0)
4357 return -1;
4359 if(CHROMA){
4360 pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4361 if(pred_mode < 0)
4362 return -1;
4363 h->chroma_pred_mode= pred_mode;
4365 }else if(partition_count==4){
4366 int i, j, sub_partition_count[4], list, ref[2][4];
4368 if(h->slice_type_nos == FF_B_TYPE){
4369 for(i=0; i<4; i++){
4370 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4371 if(h->sub_mb_type[i] >=13){
4372 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4373 return -1;
4375 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4376 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4378 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4379 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4380 pred_direct_motion(h, &mb_type);
4381 h->ref_cache[0][scan8[4]] =
4382 h->ref_cache[1][scan8[4]] =
4383 h->ref_cache[0][scan8[12]] =
4384 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4386 }else{
4387 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4388 for(i=0; i<4; i++){
4389 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4390 if(h->sub_mb_type[i] >=4){
4391 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4392 return -1;
4394 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4395 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4399 for(list=0; list<h->list_count; list++){
4400 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4401 for(i=0; i<4; i++){
4402 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4403 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4404 unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4405 if(tmp>=ref_count){
4406 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4407 return -1;
4409 ref[list][i]= tmp;
4410 }else{
4411 //FIXME
4412 ref[list][i] = -1;
4417 if(dct8x8_allowed)
4418 dct8x8_allowed = get_dct8x8_allowed(h);
4420 for(list=0; list<h->list_count; list++){
4421 for(i=0; i<4; i++){
4422 if(IS_DIRECT(h->sub_mb_type[i])) {
4423 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4424 continue;
4426 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4427 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4429 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4430 const int sub_mb_type= h->sub_mb_type[i];
4431 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4432 for(j=0; j<sub_partition_count[i]; j++){
4433 int mx, my;
4434 const int index= 4*i + block_width*j;
4435 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4436 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4437 mx += get_se_golomb(&s->gb);
4438 my += get_se_golomb(&s->gb);
4439 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4441 if(IS_SUB_8X8(sub_mb_type)){
4442 mv_cache[ 1 ][0]=
4443 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4444 mv_cache[ 1 ][1]=
4445 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4446 }else if(IS_SUB_8X4(sub_mb_type)){
4447 mv_cache[ 1 ][0]= mx;
4448 mv_cache[ 1 ][1]= my;
4449 }else if(IS_SUB_4X8(sub_mb_type)){
4450 mv_cache[ 8 ][0]= mx;
4451 mv_cache[ 8 ][1]= my;
4453 mv_cache[ 0 ][0]= mx;
4454 mv_cache[ 0 ][1]= my;
4456 }else{
4457 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4458 p[0] = p[1]=
4459 p[8] = p[9]= 0;
4463 }else if(IS_DIRECT(mb_type)){
4464 pred_direct_motion(h, &mb_type);
4465 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4466 }else{
4467 int list, mx, my, i;
4468 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4469 if(IS_16X16(mb_type)){
4470 for(list=0; list<h->list_count; list++){
4471 unsigned int val;
4472 if(IS_DIR(mb_type, 0, list)){
4473 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4474 if(val >= h->ref_count[list]){
4475 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4476 return -1;
4478 }else
4479 val= LIST_NOT_USED&0xFF;
4480 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4482 for(list=0; list<h->list_count; list++){
4483 unsigned int val;
4484 if(IS_DIR(mb_type, 0, list)){
4485 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4486 mx += get_se_golomb(&s->gb);
4487 my += get_se_golomb(&s->gb);
4488 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4490 val= pack16to32(mx,my);
4491 }else
4492 val=0;
4493 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4496 else if(IS_16X8(mb_type)){
4497 for(list=0; list<h->list_count; list++){
4498 for(i=0; i<2; i++){
4499 unsigned int val;
4500 if(IS_DIR(mb_type, i, list)){
4501 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4502 if(val >= h->ref_count[list]){
4503 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4504 return -1;
4506 }else
4507 val= LIST_NOT_USED&0xFF;
4508 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4511 for(list=0; list<h->list_count; list++){
4512 for(i=0; i<2; i++){
4513 unsigned int val;
4514 if(IS_DIR(mb_type, i, list)){
4515 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4516 mx += get_se_golomb(&s->gb);
4517 my += get_se_golomb(&s->gb);
4518 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4520 val= pack16to32(mx,my);
4521 }else
4522 val=0;
4523 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4526 }else{
4527 assert(IS_8X16(mb_type));
4528 for(list=0; list<h->list_count; list++){
4529 for(i=0; i<2; i++){
4530 unsigned int val;
4531 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4532 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4533 if(val >= h->ref_count[list]){
4534 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4535 return -1;
4537 }else
4538 val= LIST_NOT_USED&0xFF;
4539 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4542 for(list=0; list<h->list_count; list++){
4543 for(i=0; i<2; i++){
4544 unsigned int val;
4545 if(IS_DIR(mb_type, i, list)){
4546 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4547 mx += get_se_golomb(&s->gb);
4548 my += get_se_golomb(&s->gb);
4549 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4551 val= pack16to32(mx,my);
4552 }else
4553 val=0;
4554 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4560 if(IS_INTER(mb_type))
4561 write_back_motion(h, mb_type);
4563 if(!IS_INTRA16x16(mb_type)){
4564 cbp= get_ue_golomb(&s->gb);
4565 if(cbp > 47){
4566 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4567 return -1;
4570 if(CHROMA){
4571 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4572 else cbp= golomb_to_inter_cbp [cbp];
4573 }else{
4574 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4575 else cbp= golomb_to_inter_cbp_gray[cbp];
4578 h->cbp = cbp;
4580 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4581 if(get_bits1(&s->gb)){
4582 mb_type |= MB_TYPE_8x8DCT;
4583 h->cbp_table[mb_xy]= cbp;
4586 s->current_picture.mb_type[mb_xy]= mb_type;
4588 if(cbp || IS_INTRA16x16(mb_type)){
4589 int i8x8, i4x4, chroma_idx;
4590 int dquant;
4591 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4592 const uint8_t *scan, *scan8x8, *dc_scan;
4594 // fill_non_zero_count_cache(h);
4596 if(IS_INTERLACED(mb_type)){
4597 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4598 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4599 dc_scan= luma_dc_field_scan;
4600 }else{
4601 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4602 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4603 dc_scan= luma_dc_zigzag_scan;
4606 dquant= get_se_golomb(&s->gb);
4608 if( dquant > 25 || dquant < -26 ){
4609 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4610 return -1;
4613 s->qscale += dquant;
4614 if(((unsigned)s->qscale) > 51){
4615 if(s->qscale<0) s->qscale+= 52;
4616 else s->qscale-= 52;
4619 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4620 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4621 if(IS_INTRA16x16(mb_type)){
4622 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4623 return -1; //FIXME continue if partitioned and other return -1 too
4626 assert((cbp&15) == 0 || (cbp&15) == 15);
4628 if(cbp&15){
4629 for(i8x8=0; i8x8<4; i8x8++){
4630 for(i4x4=0; i4x4<4; i4x4++){
4631 const int index= i4x4 + 4*i8x8;
4632 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4633 return -1;
4637 }else{
4638 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4640 }else{
4641 for(i8x8=0; i8x8<4; i8x8++){
4642 if(cbp & (1<<i8x8)){
4643 if(IS_8x8DCT(mb_type)){
4644 DCTELEM *buf = &h->mb[64*i8x8];
4645 uint8_t *nnz;
4646 for(i4x4=0; i4x4<4; i4x4++){
4647 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4648 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4649 return -1;
4651 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4652 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4653 }else{
4654 for(i4x4=0; i4x4<4; i4x4++){
4655 const int index= i4x4 + 4*i8x8;
4657 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4658 return -1;
4662 }else{
4663 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4664 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4669 if(cbp&0x30){
4670 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4671 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4672 return -1;
4676 if(cbp&0x20){
4677 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4678 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4679 for(i4x4=0; i4x4<4; i4x4++){
4680 const int index= 16 + 4*chroma_idx + i4x4;
4681 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4682 return -1;
4686 }else{
4687 uint8_t * const nnz= &h->non_zero_count_cache[0];
4688 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4689 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4691 }else{
4692 uint8_t * const nnz= &h->non_zero_count_cache[0];
4693 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4694 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4695 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4697 s->current_picture.qscale_table[mb_xy]= s->qscale;
4698 write_back_non_zero_count(h);
4700 if(MB_MBAFF){
4701 h->ref_count[0] >>= 1;
4702 h->ref_count[1] >>= 1;
4705 return 0;
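/* CABAC mb_field_decoding_flag: the context (states 70..72) counts how many
 * of the left and above macroblock pairs in the same slice are field coded. */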
4708 static int decode_cabac_field_decoding_flag(H264Context *h) {
4709 MpegEncContext * const s = &h->s;
4710 const int mb_x = s->mb_x;
4711 const int mb_y = s->mb_y & ~1;
4712 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4713 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4715 unsigned int ctx = 0;
4717 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4718 ctx += 1;
4720 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4721 ctx += 1;
4724 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4727 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4728 uint8_t *state= &h->cabac_state[ctx_base];
4729 int mb_type;
4731 if(intra_slice){
4732 MpegEncContext * const s = &h->s;
4733 const int mba_xy = h->left_mb_xy[0];
4734 const int mbb_xy = h->top_mb_xy;
4735 int ctx=0;
4736 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4737 ctx++;
4738 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4739 ctx++;
4740 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4741 return 0; /* I4x4 */
4742 state += 2;
4743 }else{
4744 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4745 return 0; /* I4x4 */
4748 if( get_cabac_terminate( &h->cabac ) )
4749 return 25; /* PCM */
4751 mb_type = 1; /* I16x16 */
4752 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4753 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4754 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4755 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4756 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4757 return mb_type;
4760 static int decode_cabac_mb_type( H264Context *h ) {
4761 MpegEncContext * const s = &h->s;
4763 if( h->slice_type_nos == FF_I_TYPE ) {
4764 return decode_cabac_intra_mb_type(h, 3, 1);
4765 } else if( h->slice_type_nos == FF_P_TYPE ) {
4766 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4767 /* P-type */
4768 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4769 /* P_L0_D16x16, P_8x8 */
4770 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4771 } else {
4772 /* P_L0_D8x16, P_L0_D16x8 */
4773 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4775 } else {
4776 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4778 } else if( h->slice_type_nos == FF_B_TYPE ) {
4779 const int mba_xy = h->left_mb_xy[0];
4780 const int mbb_xy = h->top_mb_xy;
4781 int ctx = 0;
4782 int bits;
4784 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4785 ctx++;
4786 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4787 ctx++;
4789 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4790 return 0; /* B_Direct_16x16 */
4792 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4793 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4796 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4797 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4798 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4799 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4800 if( bits < 8 )
4801 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4802 else if( bits == 13 ) {
4803 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4804 } else if( bits == 14 )
4805 return 11; /* B_L1_L0_8x16 */
4806 else if( bits == 15 )
4807 return 22; /* B_8x8 */
4809 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4810 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4811 } else {
4812 /* TODO SI/SP frames? */
4813 return -1;
4817 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4818 MpegEncContext * const s = &h->s;
4819 int mba_xy, mbb_xy;
4820 int ctx = 0;
4822 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4823 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4824 mba_xy = mb_xy - 1;
4825 if( (mb_y&1)
4826 && h->slice_table[mba_xy] == h->slice_num
4827 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4828 mba_xy += s->mb_stride;
4829 if( MB_FIELD ){
4830 mbb_xy = mb_xy - s->mb_stride;
4831 if( !(mb_y&1)
4832 && h->slice_table[mbb_xy] == h->slice_num
4833 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4834 mbb_xy -= s->mb_stride;
4835 }else
4836 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4837 }else{
4838 int mb_xy = h->mb_xy;
4839 mba_xy = mb_xy - 1;
4840 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4843 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4844 ctx++;
4845 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4846 ctx++;
4848 if( h->slice_type_nos == FF_B_TYPE )
4849 ctx += 13;
4850 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4853 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4854 int mode = 0;
4856 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4857 return pred_mode;
4859 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4860 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4861 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4863 if( mode >= pred_mode )
4864 return mode + 1;
4865 else
4866 return mode;
4869 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4870 const int mba_xy = h->left_mb_xy[0];
4871 const int mbb_xy = h->top_mb_xy;
4873 int ctx = 0;
4875 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4876 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4877 ctx++;
4879 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4880 ctx++;
4882 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4883 return 0;
4885 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4886 return 1;
4887 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4888 return 2;
4889 else
4890 return 3;
4893 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4894 int cbp_b, cbp_a, ctx, cbp = 0;
4896 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4897 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
4899 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4900 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4901 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
4902 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4903 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
4904 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4905 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
4906 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4907 return cbp;
4909 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4910 int ctx;
4911 int cbp_a, cbp_b;
4913 cbp_a = (h->left_cbp>>4)&0x03;
4914 cbp_b = (h-> top_cbp>>4)&0x03;
4916 ctx = 0;
4917 if( cbp_a > 0 ) ctx++;
4918 if( cbp_b > 0 ) ctx += 2;
4919 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4920 return 0;
4922 ctx = 4;
4923 if( cbp_a == 2 ) ctx++;
4924 if( cbp_b == 2 ) ctx += 2;
4925 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4927 static int decode_cabac_mb_dqp( H264Context *h) {
4928 int ctx = 0;
4929 int val = 0;
4931 if( h->last_qscale_diff != 0 )
4932 ctx++;
4934 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4935 if( ctx < 2 )
4936 ctx = 2;
4937 else
4938 ctx = 3;
4939 val++;
4940 if(val > 102) //prevent infinite loop
4941 return INT_MIN;
4944 if( val&0x01 )
4945 return (val + 1)/2;
4946 else
4947 return -(val + 1)/2;
4949 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4950 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4951 return 0; /* 8x8 */
4952 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4953 return 1; /* 8x4 */
4954 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4955 return 2; /* 4x8 */
4956 return 3; /* 4x4 */
4958 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4959 int type;
4960 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4961 return 0; /* B_Direct_8x8 */
4962 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4963 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4964 type = 3;
4965 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4966 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4967 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4968 type += 4;
4970 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4971 type += get_cabac( &h->cabac, &h->cabac_state[39] );
4972 return type;
4975 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
4976 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
4979 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
4980 int refa = h->ref_cache[list][scan8[n] - 1];
4981 int refb = h->ref_cache[list][scan8[n] - 8];
4982 int ref = 0;
4983 int ctx = 0;
4985 if( h->slice_type_nos == FF_B_TYPE) {
4986 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
4987 ctx++;
4988 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
4989 ctx += 2;
4990 } else {
4991 if( refa > 0 )
4992 ctx++;
4993 if( refb > 0 )
4994 ctx += 2;
4997 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
4998 ref++;
4999 if( ctx < 4 )
5000 ctx = 4;
5001 else
5002 ctx = 5;
5003 if(ref >= 32 /*h->ref_list[list]*/){
5004 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5005 return 0; //FIXME we should return -1 and check the return everywhere
5008 return ref;
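/**
 * Decodes one motion vector difference component with CABAC: the context is
 * chosen from the sum of the neighbouring |mvd| values, the magnitude uses a
 * truncated unary prefix (up to 9) followed by an exp-Golomb style suffix,
 * and the sign is a single bypass bin.
 */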
5011 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5012 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5013 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5014 int ctxbase = (l == 0) ? 40 : 47;
5015 int ctx, mvd;
5017 if( amvd < 3 )
5018 ctx = 0;
5019 else if( amvd > 32 )
5020 ctx = 2;
5021 else
5022 ctx = 1;
5024 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5025 return 0;
5027 mvd= 1;
5028 ctx= 3;
5029 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5030 mvd++;
5031 if( ctx < 6 )
5032 ctx++;
5035 if( mvd >= 9 ) {
5036 int k = 3;
5037 while( get_cabac_bypass( &h->cabac ) ) {
5038 mvd += 1 << k;
5039 k++;
5040 if(k>24){
5041 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5042 return INT_MIN;
5045 while( k-- ) {
5046 if( get_cabac_bypass( &h->cabac ) )
5047 mvd += 1 << k;
5050 return get_cabac_bypass_sign( &h->cabac, -mvd );
5053 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5054 int nza, nzb;
5055 int ctx = 0;
5057 if( is_dc ) {
5058 if( cat == 0 ) {
5059 nza = h->left_cbp&0x100;
5060 nzb = h-> top_cbp&0x100;
5061 } else {
5062 nza = (h->left_cbp>>(6+idx))&0x01;
5063 nzb = (h-> top_cbp>>(6+idx))&0x01;
5065 } else {
5066 if( cat == 4 ) {
5067 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5068 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5069 } else {
5070 assert(cat == 1 || cat == 2);
5071 nza = h->non_zero_count_cache[scan8[idx] - 1];
5072 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5076 if( nza > 0 )
5077 ctx++;
5079 if( nzb > 0 )
5080 ctx += 2;
5082 return ctx + 4 * cat;
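/* The coded_block_flag context increment is 4*cat + (left nonzero ? 1 : 0)
 * + (top nonzero ? 2 : 0); the caller adds the base offset 85 when indexing
 * cabac_state[]. */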
5085 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5086 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5087 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5088 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5089 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5092 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5093 static const int significant_coeff_flag_offset[2][6] = {
5094 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5095 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5097 static const int last_coeff_flag_offset[2][6] = {
5098 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5099 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5101 static const int coeff_abs_level_m1_offset[6] = {
5102 227+0, 227+10, 227+20, 227+30, 227+39, 426
5104 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5105 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5106 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5107 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5108 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5109 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5110 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5111 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5112 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5114 /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5115 * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5116 * map node ctx => cabac ctx for level=1 */
5117 static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5118 /* map node ctx => cabac ctx for level>1 */
5119 static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5120 static const uint8_t coeff_abs_level_transition[2][8] = {
5121 /* update node ctx after decoding a level=1 */
5122 { 1, 2, 3, 3, 4, 5, 6, 7 },
5123 /* update node ctx after decoding a level>1 */
5124 { 4, 4, 4, 4, 5, 6, 7, 7 }
5127 int index[64];
5129 int av_unused last;
5130 int coeff_count = 0;
5131 int node_ctx = 0;
5133 uint8_t *significant_coeff_ctx_base;
5134 uint8_t *last_coeff_ctx_base;
5135 uint8_t *abs_level_m1_ctx_base;
5137 #ifndef ARCH_X86
5138 #define CABAC_ON_STACK
5139 #endif
5140 #ifdef CABAC_ON_STACK
5141 #define CC &cc
5142 CABACContext cc;
5143 cc.range = h->cabac.range;
5144 cc.low = h->cabac.low;
5145 cc.bytestream= h->cabac.bytestream;
5146 #else
5147 #define CC &h->cabac
5148 #endif
5151 /* cat: 0-> DC 16x16 n = 0
5152 * 1-> AC 16x16 n = luma4x4idx
5153 * 2-> Luma4x4 n = luma4x4idx
5154 * 3-> DC Chroma n = iCbCr
5155 * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5156 * 5-> Luma8x8 n = 4 * luma8x8idx
5159 /* read coded block flag */
5160 if( is_dc || cat != 5 ) {
5161 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5162 if( !is_dc ) {
5163 if( cat == 4 )
5164 h->non_zero_count_cache[scan8[16+n]] = 0;
5165 else
5166 h->non_zero_count_cache[scan8[n]] = 0;
5169 #ifdef CABAC_ON_STACK
5170 h->cabac.range = cc.range ;
5171 h->cabac.low = cc.low ;
5172 h->cabac.bytestream= cc.bytestream;
5173 #endif
5174 return;
5178 significant_coeff_ctx_base = h->cabac_state
5179 + significant_coeff_flag_offset[MB_FIELD][cat];
5180 last_coeff_ctx_base = h->cabac_state
5181 + last_coeff_flag_offset[MB_FIELD][cat];
5182 abs_level_m1_ctx_base = h->cabac_state
5183 + coeff_abs_level_m1_offset[cat];
5185 if( !is_dc && cat == 5 ) {
5186 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5187 for(last= 0; last < coefs; last++) { \
5188 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5189 if( get_cabac( CC, sig_ctx )) { \
5190 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5191 index[coeff_count++] = last; \
5192 if( get_cabac( CC, last_ctx ) ) { \
5193 last= max_coeff; \
5194 break; \
5198 if( last == max_coeff -1 ) {\
5199 index[coeff_count++] = last;\
5201 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5202 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5203 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5204 } else {
5205 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5206 #else
5207 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5208 } else {
5209 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5210 #endif
5212 assert(coeff_count > 0);
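/* At this point index[] holds the scan positions of the coeff_count
 * significant coefficients; their levels are decoded below in reverse scan
 * order. node_ctx counts the coefficients equal to 1 seen so far and, once
 * a magnitude > 1 occurs, switches to the abslevelgt1 states, selecting the
 * context for each level's first bin. */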
5214 if( is_dc ) {
5215 if( cat == 0 )
5216 h->cbp_table[h->mb_xy] |= 0x100;
5217 else
5218 h->cbp_table[h->mb_xy] |= 0x40 << n;
5219 } else {
5220 if( cat == 5 )
5221 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5222 else if( cat == 4 )
5223 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5224 else {
5225 assert( cat == 1 || cat == 2 );
5226 h->non_zero_count_cache[scan8[n]] = coeff_count;
5230 do {
5231 uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5233 int j= scantable[index[--coeff_count]];
5235 if( get_cabac( CC, ctx ) == 0 ) {
5236 node_ctx = coeff_abs_level_transition[0][node_ctx];
5237 if( is_dc ) {
5238 block[j] = get_cabac_bypass_sign( CC, -1);
5239 }else{
5240 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5242 } else {
5243 int coeff_abs = 2;
5244 ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5245 node_ctx = coeff_abs_level_transition[1][node_ctx];
5247 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5248 coeff_abs++;
5251 if( coeff_abs >= 15 ) {
5252 int j = 0;
5253 while( get_cabac_bypass( CC ) ) {
5254 j++;
5257 coeff_abs=1;
5258 while( j-- ) {
5259 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5261 coeff_abs+= 14;
5264 if( is_dc ) {
5265 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5266 }else{
5267 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5270 } while( coeff_count );
5271 #ifdef CABAC_ON_STACK
5272 h->cabac.range = cc.range ;
5273 h->cabac.low = cc.low ;
5274 h->cabac.bytestream= cc.bytestream;
5275 #endif
5279 #ifndef CONFIG_SMALL
5280 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5281 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5284 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5285 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5287 #endif
5289 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5290 #ifdef CONFIG_SMALL
5291 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5292 #else
5293 if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5294 else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5295 #endif
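/* cat 0 (luma DC) and cat 3 (chroma DC) are the DC block categories; unless
 * CONFIG_SMALL is set, two specialised copies of the residual decoder are
 * built so the is_dc branches can be folded away at compile time. */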
5298 static inline void compute_mb_neighbors(H264Context *h)
5300 MpegEncContext * const s = &h->s;
5301 const int mb_xy = h->mb_xy;
5302 h->top_mb_xy = mb_xy - s->mb_stride;
5303 h->left_mb_xy[0] = mb_xy - 1;
5304 if(FRAME_MBAFF){
5305 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5306 const int top_pair_xy = pair_xy - s->mb_stride;
5307 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5308 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5309 const int curr_mb_frame_flag = !MB_FIELD;
5310 const int bottom = (s->mb_y & 1);
5311 if (bottom
5312 ? !curr_mb_frame_flag // bottom macroblock
5313 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5315 h->top_mb_xy -= s->mb_stride;
5317 if (left_mb_frame_flag != curr_mb_frame_flag) {
5318 h->left_mb_xy[0] = pair_xy - 1;
5320 } else if (FIELD_PICTURE) {
5321 h->top_mb_xy -= s->mb_stride;
5323 return;
5327 * decodes a macroblock
5328 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5330 static int decode_mb_cabac(H264Context *h) {
5331 MpegEncContext * const s = &h->s;
5332 int mb_xy;
5333 int mb_type, partition_count, cbp = 0;
5334 int dct8x8_allowed= h->pps.transform_8x8_mode;
5336 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5338 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
5340 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5341 if( h->slice_type_nos != FF_I_TYPE ) {
5342 int skip;
5343 /* a skipped mb needs the aff flag from the following mb */
5344 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5345 predict_field_decoding_flag(h);
5346 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5347 skip = h->next_mb_skipped;
5348 else
5349 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5350 /* read skip flags */
5351 if( skip ) {
5352 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5353 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5354 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5355 if(h->next_mb_skipped)
5356 predict_field_decoding_flag(h);
5357 else
5358 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5361 decode_mb_skip(h);
5363 h->cbp_table[mb_xy] = 0;
5364 h->chroma_pred_mode_table[mb_xy] = 0;
5365 h->last_qscale_diff = 0;
5367 return 0;
5371 if(FRAME_MBAFF){
5372 if( (s->mb_y&1) == 0 )
5373 h->mb_mbaff =
5374 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5377 h->prev_mb_skipped = 0;
5379 compute_mb_neighbors(h);
5380 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5381 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5382 return -1;
5385 if( h->slice_type_nos == FF_B_TYPE ) {
5386 if( mb_type < 23 ){
5387 partition_count= b_mb_type_info[mb_type].partition_count;
5388 mb_type= b_mb_type_info[mb_type].type;
5389 }else{
5390 mb_type -= 23;
5391 goto decode_intra_mb;
5393 } else if( h->slice_type_nos == FF_P_TYPE ) {
5394 if( mb_type < 5) {
5395 partition_count= p_mb_type_info[mb_type].partition_count;
5396 mb_type= p_mb_type_info[mb_type].type;
5397 } else {
5398 mb_type -= 5;
5399 goto decode_intra_mb;
5401 } else {
5402 if(h->slice_type == FF_SI_TYPE && mb_type)
5403 mb_type--;
5404 assert(h->slice_type_nos == FF_I_TYPE);
5405 decode_intra_mb:
5406 partition_count = 0;
5407 cbp= i_mb_type_info[mb_type].cbp;
5408 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5409 mb_type= i_mb_type_info[mb_type].type;
5411 if(MB_FIELD)
5412 mb_type |= MB_TYPE_INTERLACED;
5414 h->slice_table[ mb_xy ]= h->slice_num;
5416 if(IS_INTRA_PCM(mb_type)) {
5417 const uint8_t *ptr;
5419 // We assume these blocks are very rare, so we do not optimize for them.
5420 // FIXME The two following lines get the bitstream position in the cabac
5421 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5422 ptr= h->cabac.bytestream;
5423 if(h->cabac.low&0x1) ptr--;
5424 if(CABAC_BITS==16){
5425 if(h->cabac.low&0x1FF) ptr--;
5428 // The pixels are stored in the same order as the levels in the h->mb array.
5429 memcpy(h->mb, ptr, 256); ptr+=256;
5430 if(CHROMA){
5431 memcpy(h->mb+128, ptr, 128); ptr+=128;
5434 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5436 // All blocks are present
5437 h->cbp_table[mb_xy] = 0x1ef;
5438 h->chroma_pred_mode_table[mb_xy] = 0;
5439 // In deblocking, the quantizer is 0
5440 s->current_picture.qscale_table[mb_xy]= 0;
5441 // All coeffs are present
5442 memset(h->non_zero_count[mb_xy], 16, 16);
5443 s->current_picture.mb_type[mb_xy]= mb_type;
5444 h->last_qscale_diff = 0;
5445 return 0;
5448 if(MB_MBAFF){
5449 h->ref_count[0] <<= 1;
5450 h->ref_count[1] <<= 1;
5453 fill_caches(h, mb_type, 0);
5455 if( IS_INTRA( mb_type ) ) {
5456 int i, pred_mode;
5457 if( IS_INTRA4x4( mb_type ) ) {
5458 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5459 mb_type |= MB_TYPE_8x8DCT;
5460 for( i = 0; i < 16; i+=4 ) {
5461 int pred = pred_intra_mode( h, i );
5462 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5463 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5465 } else {
5466 for( i = 0; i < 16; i++ ) {
5467 int pred = pred_intra_mode( h, i );
5468 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5470 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5473 write_back_intra_pred_mode(h);
5474 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5475 } else {
5476 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5477 if( h->intra16x16_pred_mode < 0 ) return -1;
5479 if(CHROMA){
5480 h->chroma_pred_mode_table[mb_xy] =
5481 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5483 pred_mode= check_intra_pred_mode( h, pred_mode );
5484 if( pred_mode < 0 ) return -1;
5485 h->chroma_pred_mode= pred_mode;
5487 } else if( partition_count == 4 ) {
5488 int i, j, sub_partition_count[4], list, ref[2][4];
5490 if( h->slice_type_nos == FF_B_TYPE ) {
5491 for( i = 0; i < 4; i++ ) {
5492 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5493 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5494 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5496 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5497 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5498 pred_direct_motion(h, &mb_type);
5499 h->ref_cache[0][scan8[4]] =
5500 h->ref_cache[1][scan8[4]] =
5501 h->ref_cache[0][scan8[12]] =
5502 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5503 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5504 for( i = 0; i < 4; i++ )
5505 if( IS_DIRECT(h->sub_mb_type[i]) )
5506 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5509 } else {
5510 for( i = 0; i < 4; i++ ) {
5511 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5512 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5513 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5517 for( list = 0; list < h->list_count; list++ ) {
5518 for( i = 0; i < 4; i++ ) {
5519 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5520 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5521 if( h->ref_count[list] > 1 )
5522 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5523 else
5524 ref[list][i] = 0;
5525 } else {
5526 ref[list][i] = -1;
5528 h->ref_cache[list][ scan8[4*i]+1 ]=
5529 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5533 if(dct8x8_allowed)
5534 dct8x8_allowed = get_dct8x8_allowed(h);
5536 for(list=0; list<h->list_count; list++){
5537 for(i=0; i<4; i++){
5538 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5539 if(IS_DIRECT(h->sub_mb_type[i])){
5540 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5541 continue;
5544 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5545 const int sub_mb_type= h->sub_mb_type[i];
5546 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5547 for(j=0; j<sub_partition_count[i]; j++){
5548 int mpx, mpy;
5549 int mx, my;
5550 const int index= 4*i + block_width*j;
5551 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5552 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5553 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5555 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5556 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5557 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5559 if(IS_SUB_8X8(sub_mb_type)){
5560 mv_cache[ 1 ][0]=
5561 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5562 mv_cache[ 1 ][1]=
5563 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5565 mvd_cache[ 1 ][0]=
5566 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5567 mvd_cache[ 1 ][1]=
5568 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5569 }else if(IS_SUB_8X4(sub_mb_type)){
5570 mv_cache[ 1 ][0]= mx;
5571 mv_cache[ 1 ][1]= my;
5573 mvd_cache[ 1 ][0]= mx - mpx;
5574 mvd_cache[ 1 ][1]= my - mpy;
5575 }else if(IS_SUB_4X8(sub_mb_type)){
5576 mv_cache[ 8 ][0]= mx;
5577 mv_cache[ 8 ][1]= my;
5579 mvd_cache[ 8 ][0]= mx - mpx;
5580 mvd_cache[ 8 ][1]= my - mpy;
5582 mv_cache[ 0 ][0]= mx;
5583 mv_cache[ 0 ][1]= my;
5585 mvd_cache[ 0 ][0]= mx - mpx;
5586 mvd_cache[ 0 ][1]= my - mpy;
5588 }else{
5589 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5590 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5591 p[0] = p[1] = p[8] = p[9] = 0;
5592 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5596 } else if( IS_DIRECT(mb_type) ) {
5597 pred_direct_motion(h, &mb_type);
5598 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5599 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5600 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5601 } else {
5602 int list, mx, my, i, mpx, mpy;
5603 if(IS_16X16(mb_type)){
5604 for(list=0; list<h->list_count; list++){
5605 if(IS_DIR(mb_type, 0, list)){
5606 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5607 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5608 }else
5609 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5611 for(list=0; list<h->list_count; list++){
5612 if(IS_DIR(mb_type, 0, list)){
5613 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5615 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5616 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5617 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5619 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5620 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5621 }else
5622 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5625 else if(IS_16X8(mb_type)){
5626 for(list=0; list<h->list_count; list++){
5627 for(i=0; i<2; i++){
5628 if(IS_DIR(mb_type, i, list)){
5629 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5630 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5631 }else
5632 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5635 for(list=0; list<h->list_count; list++){
5636 for(i=0; i<2; i++){
5637 if(IS_DIR(mb_type, i, list)){
5638 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5639 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5640 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5641 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5643 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5644 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5645 }else{
5646 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5647 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5651 }else{
5652 assert(IS_8X16(mb_type));
5653 for(list=0; list<h->list_count; list++){
5654 for(i=0; i<2; i++){
5655 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5656 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5657 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5658 }else
5659 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5662 for(list=0; list<h->list_count; list++){
5663 for(i=0; i<2; i++){
5664 if(IS_DIR(mb_type, i, list)){
5665 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5666 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5667 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5669 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5670 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5671 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5672 }else{
5673 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5674 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5681 if( IS_INTER( mb_type ) ) {
5682 h->chroma_pred_mode_table[mb_xy] = 0;
5683 write_back_motion( h, mb_type );
5686 if( !IS_INTRA16x16( mb_type ) ) {
5687 cbp = decode_cabac_mb_cbp_luma( h );
5688 if(CHROMA)
5689 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5692 h->cbp_table[mb_xy] = h->cbp = cbp;
5694 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5695 if( decode_cabac_mb_transform_size( h ) )
5696 mb_type |= MB_TYPE_8x8DCT;
5698 s->current_picture.mb_type[mb_xy]= mb_type;
5700 if( cbp || IS_INTRA16x16( mb_type ) ) {
5701 const uint8_t *scan, *scan8x8, *dc_scan;
5702 const uint32_t *qmul;
5703 int dqp;
5705 if(IS_INTERLACED(mb_type)){
5706 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5707 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5708 dc_scan= luma_dc_field_scan;
5709 }else{
5710 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5711 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5712 dc_scan= luma_dc_zigzag_scan;
5715 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5716 if( dqp == INT_MIN ){
5717 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5718 return -1;
5720 s->qscale += dqp;
5721 if(((unsigned)s->qscale) > 51){
5722 if(s->qscale<0) s->qscale+= 52;
5723 else s->qscale-= 52;
5725 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5726 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5728 if( IS_INTRA16x16( mb_type ) ) {
5729 int i;
5730 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5731 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5733 if( cbp&15 ) {
5734 qmul = h->dequant4_coeff[0][s->qscale];
5735 for( i = 0; i < 16; i++ ) {
5736 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5737 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5739 } else {
5740 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5742 } else {
5743 int i8x8, i4x4;
5744 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5745 if( cbp & (1<<i8x8) ) {
5746 if( IS_8x8DCT(mb_type) ) {
5747 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5748 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5749 } else {
5750 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5751 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5752 const int index = 4*i8x8 + i4x4;
5753 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5754 //START_TIMER
5755 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5756 //STOP_TIMER("decode_residual")
5759 } else {
5760 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5761 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5766 if( cbp&0x30 ){
5767 int c;
5768 for( c = 0; c < 2; c++ ) {
5769 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5770 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5774 if( cbp&0x20 ) {
5775 int c, i;
5776 for( c = 0; c < 2; c++ ) {
5777 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5778 for( i = 0; i < 4; i++ ) {
5779 const int index = 16 + 4 * c + i;
5780 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5781 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5784 } else {
5785 uint8_t * const nnz= &h->non_zero_count_cache[0];
5786 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5787 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5789 } else {
5790 uint8_t * const nnz= &h->non_zero_count_cache[0];
5791 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5792 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5793 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5794 h->last_qscale_diff = 0;
5797 s->current_picture.qscale_table[mb_xy]= s->qscale;
5798 write_back_non_zero_count(h);
5800 if(MB_MBAFF){
5801 h->ref_count[0] >>= 1;
5802 h->ref_count[1] >>= 1;
5805 return 0;
5809 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5810 int i, d;
5811 const int index_a = qp + h->slice_alpha_c0_offset;
5812 const int alpha = (alpha_table+52)[index_a];
5813 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
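/* alpha and beta are the deblocking thresholds from the spec's tables,
 * indexed by the edge QP plus the slice alpha/beta offsets; a line of
 * samples is filtered only if |p0-q0| < alpha and both |p1-p0| and |q1-q0|
 * are below beta. */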
5815 if( bS[0] < 4 ) {
5816 int8_t tc[4];
5817 for(i=0; i<4; i++)
5818 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5819 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5820 } else {
5821 /* 16px edge length, because bS=4 is triggered by being at
5822 * the edge of an intra MB, so all 4 bS are the same */
5823 for( d = 0; d < 16; d++ ) {
5824 const int p0 = pix[-1];
5825 const int p1 = pix[-2];
5826 const int p2 = pix[-3];
5828 const int q0 = pix[0];
5829 const int q1 = pix[1];
5830 const int q2 = pix[2];
5832 if( FFABS( p0 - q0 ) < alpha &&
5833 FFABS( p1 - p0 ) < beta &&
5834 FFABS( q1 - q0 ) < beta ) {
5836 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5837 if( FFABS( p2 - p0 ) < beta)
5839 const int p3 = pix[-4];
5840 /* p0', p1', p2' */
5841 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5842 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5843 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5844 } else {
5845 /* p0' */
5846 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5848 if( FFABS( q2 - q0 ) < beta)
5850 const int q3 = pix[3];
5851 /* q0', q1', q2' */
5852 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5853 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5854 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5855 } else {
5856 /* q0' */
5857 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5859 }else{
5860 /* p0', q0' */
5861 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5862 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5864 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5866 pix += stride;
5870 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5871 int i;
5872 const int index_a = qp + h->slice_alpha_c0_offset;
5873 const int alpha = (alpha_table+52)[index_a];
5874 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5876 if( bS[0] < 4 ) {
5877 int8_t tc[4];
5878 for(i=0; i<4; i++)
5879 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5880 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5881 } else {
5882 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5886 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5887 int i;
5888 for( i = 0; i < 16; i++, pix += stride) {
5889 int index_a;
5890 int alpha;
5891 int beta;
5893 int qp_index;
5894 int bS_index = (i >> 1);
5895 if (!MB_FIELD) {
5896 bS_index &= ~1;
5897 bS_index |= (i & 1);
5900 if( bS[bS_index] == 0 ) {
5901 continue;
5904 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5905 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5906 alpha = (alpha_table+52)[index_a];
5907 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5909 if( bS[bS_index] < 4 ) {
5910 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5911 const int p0 = pix[-1];
5912 const int p1 = pix[-2];
5913 const int p2 = pix[-3];
5914 const int q0 = pix[0];
5915 const int q1 = pix[1];
5916 const int q2 = pix[2];
5918 if( FFABS( p0 - q0 ) < alpha &&
5919 FFABS( p1 - p0 ) < beta &&
5920 FFABS( q1 - q0 ) < beta ) {
5921 int tc = tc0;
5922 int i_delta;
5924 if( FFABS( p2 - p0 ) < beta ) {
5925 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5926 tc++;
5928 if( FFABS( q2 - q0 ) < beta ) {
5929 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5930 tc++;
5933 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5934 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
5935 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
5936 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5938 }else{
5939 const int p0 = pix[-1];
5940 const int p1 = pix[-2];
5941 const int p2 = pix[-3];
5943 const int q0 = pix[0];
5944 const int q1 = pix[1];
5945 const int q2 = pix[2];
5947 if( FFABS( p0 - q0 ) < alpha &&
5948 FFABS( p1 - p0 ) < beta &&
5949 FFABS( q1 - q0 ) < beta ) {
5951 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5952 if( FFABS( p2 - p0 ) < beta)
5954 const int p3 = pix[-4];
5955 /* p0', p1', p2' */
5956 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5957 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5958 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5959 } else {
5960 /* p0' */
5961 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5963 if( FFABS( q2 - q0 ) < beta)
5965 const int q3 = pix[3];
5966 /* q0', q1', q2' */
5967 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5968 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5969 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5970 } else {
5971 /* q0' */
5972 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5974 }else{
5975 /* p0', q0' */
5976 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5977 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5979 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5984 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5985 int i;
5986 for( i = 0; i < 8; i++, pix += stride) {
5987 int index_a;
5988 int alpha;
5989 int beta;
5991 int qp_index;
5992 int bS_index = i;
5994 if( bS[bS_index] == 0 ) {
5995 continue;
5998 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5999 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6000 alpha = (alpha_table+52)[index_a];
6001 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6003 if( bS[bS_index] < 4 ) {
6004 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6005 const int p0 = pix[-1];
6006 const int p1 = pix[-2];
6007 const int q0 = pix[0];
6008 const int q1 = pix[1];
6010 if( FFABS( p0 - q0 ) < alpha &&
6011 FFABS( p1 - p0 ) < beta &&
6012 FFABS( q1 - q0 ) < beta ) {
6013 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6015 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6016 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6017 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6019 }else{
6020 const int p0 = pix[-1];
6021 const int p1 = pix[-2];
6022 const int q0 = pix[0];
6023 const int q1 = pix[1];
6025 if( FFABS( p0 - q0 ) < alpha &&
6026 FFABS( p1 - p0 ) < beta &&
6027 FFABS( q1 - q0 ) < beta ) {
6029 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6030 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6031 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6037 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6038 int i, d;
6039 const int index_a = qp + h->slice_alpha_c0_offset;
6040 const int alpha = (alpha_table+52)[index_a];
6041 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6042 const int pix_next = stride;
6044 if( bS[0] < 4 ) {
6045 int8_t tc[4];
6046 for(i=0; i<4; i++)
6047 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6048 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6049 } else {
6050 /* 16px edge length, see filter_mb_edgev */
6051 for( d = 0; d < 16; d++ ) {
6052 const int p0 = pix[-1*pix_next];
6053 const int p1 = pix[-2*pix_next];
6054 const int p2 = pix[-3*pix_next];
6055 const int q0 = pix[0];
6056 const int q1 = pix[1*pix_next];
6057 const int q2 = pix[2*pix_next];
6059 if( FFABS( p0 - q0 ) < alpha &&
6060 FFABS( p1 - p0 ) < beta &&
6061 FFABS( q1 - q0 ) < beta ) {
6063 const int p3 = pix[-4*pix_next];
6064 const int q3 = pix[ 3*pix_next];
6066 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6067 if( FFABS( p2 - p0 ) < beta) {
6068 /* p0', p1', p2' */
6069 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6070 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6071 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6072 } else {
6073 /* p0' */
6074 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6076 if( FFABS( q2 - q0 ) < beta) {
6077 /* q0', q1', q2' */
6078 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6079 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6080 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6081 } else {
6082 /* q0' */
6083 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6085 }else{
6086 /* p0', q0' */
6087 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6088 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6090 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6092 pix++;
6097 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6098 int i;
6099 const int index_a = qp + h->slice_alpha_c0_offset;
6100 const int alpha = (alpha_table+52)[index_a];
6101 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6103 if( bS[0] < 4 ) {
6104 int8_t tc[4];
6105 for(i=0; i<4; i++)
6106 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6107 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6108 } else {
6109 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6113 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6114 MpegEncContext * const s = &h->s;
6115 int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6116 int mb_xy, mb_type;
6117 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6119 mb_xy = h->mb_xy;
6121 if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6122 1 ||
6123 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6124 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6125 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6126 return;
6128 assert(!FRAME_MBAFF);
6130 mb_type = s->current_picture.mb_type[mb_xy];
6131 qp = s->current_picture.qscale_table[mb_xy];
6132 qp0 = s->current_picture.qscale_table[mb_xy-1];
6133 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6134 qpc = get_chroma_qp( h, 0, qp );
6135 qpc0 = get_chroma_qp( h, 0, qp0 );
6136 qpc1 = get_chroma_qp( h, 0, qp1 );
6137 qp0 = (qp + qp0 + 1) >> 1;
6138 qp1 = (qp + qp1 + 1) >> 1;
6139 qpc0 = (qpc + qpc0 + 1) >> 1;
6140 qpc1 = (qpc + qpc1 + 1) >> 1;
6141 qp_thresh = 15 - h->slice_alpha_c0_offset;
6142 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6143 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6144 return;
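/* The early return above covers the case where every relevant QP is at or
 * below qp_thresh: then indexA stays below 16, alpha is 0 for all edges of
 * this macroblock, and the loop filter cannot change any sample. */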
6146 if( IS_INTRA(mb_type) ) {
6147 int16_t bS4[4] = {4,4,4,4};
6148 int16_t bS3[4] = {3,3,3,3};
6149 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6150 if( IS_8x8DCT(mb_type) ) {
6151 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6152 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6153 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6154 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6155 } else {
6156 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6157 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6158 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6159 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6160 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6161 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6162 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6163 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6165 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6166 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6167 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6168 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6169 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6170 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6171 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6172 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6173 return;
6174 } else {
6175 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6176 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6177 int edges;
6178 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6179 edges = 4;
6180 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6181 } else {
6182 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6183 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6184 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6185 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6186 ? 3 : 0;
6187 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6188 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6189 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6190 (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6192 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6193 bSv[0][0] = 0x0004000400040004ULL;
6194 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6195 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6197 #define FILTER(hv,dir,edge)\
6198 if(bSv[dir][edge]) {\
6199 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6200 if(!(edge&1)) {\
6201 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6202 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6205 if( edges == 1 ) {
6206 FILTER(v,0,0);
6207 FILTER(h,1,0);
6208 } else if( IS_8x8DCT(mb_type) ) {
6209 FILTER(v,0,0);
6210 FILTER(v,0,2);
6211 FILTER(h,1,0);
6212 FILTER(h,1,2);
6213 } else {
6214 FILTER(v,0,0);
6215 FILTER(v,0,1);
6216 FILTER(v,0,2);
6217 FILTER(v,0,3);
6218 FILTER(h,1,0);
6219 FILTER(h,1,1);
6220 FILTER(h,1,2);
6221 FILTER(h,1,3);
6223 #undef FILTER
6227 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6228 MpegEncContext * const s = &h->s;
6229 const int mb_xy= mb_x + mb_y*s->mb_stride;
6230 const int mb_type = s->current_picture.mb_type[mb_xy];
6231 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6232 int first_vertical_edge_done = 0;
6233 int dir;
6235 //for sufficiently low qp, filtering wouldn't do anything
6236 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6237 if(!FRAME_MBAFF){
6238 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6239 int qp = s->current_picture.qscale_table[mb_xy];
6240 if(qp <= qp_thresh
6241 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6242 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6243 return;
6247 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6248 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6249 int top_type, left_type[2];
6250 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6251 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6252 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6254 if(IS_8x8DCT(top_type)){
6255 h->non_zero_count_cache[4+8*0]=
6256 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6257 h->non_zero_count_cache[6+8*0]=
6258 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6260 if(IS_8x8DCT(left_type[0])){
6261 h->non_zero_count_cache[3+8*1]=
6262 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6264 if(IS_8x8DCT(left_type[1])){
6265 h->non_zero_count_cache[3+8*3]=
6266 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6269 if(IS_8x8DCT(mb_type)){
6270 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6271 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
6273 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6274 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6276 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6277 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6279 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6280 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6284 if (FRAME_MBAFF
6285 // left mb is in picture
6286 && h->slice_table[mb_xy-1] != 0xFFFF
6287 // and current and left pair do not have the same interlaced type
6288 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6289 // and left mb is in the same slice if deblocking_filter == 2
6290 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6291 /* First vertical edge is different in MBAFF frames
6292 * There are 8 different bS to compute and 2 different Qp
6294 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6295 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6296 int16_t bS[8];
6297 int qp[2];
6298 int bqp[2];
6299 int rqp[2];
6300 int mb_qp, mbn0_qp, mbn1_qp;
6301 int i;
6302 first_vertical_edge_done = 1;
6304 if( IS_INTRA(mb_type) )
6305 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6306 else {
6307 for( i = 0; i < 8; i++ ) {
6308 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6310 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6311 bS[i] = 4;
6312 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6313 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6314 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6316 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6317 bS[i] = 2;
6318 else
6319 bS[i] = 1;
6323 mb_qp = s->current_picture.qscale_table[mb_xy];
6324 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6325 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6326 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6327 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6328 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6329 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6330 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6331 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6332 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6333 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6334 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6335 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6337 /* Filter edge */
6338 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6339 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6340 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6341 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6342 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6344 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6345 for( dir = 0; dir < 2; dir++ )
6347 int edge;
6348 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6349 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6350 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6351 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6352 int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6354 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6355 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6356 // how often to recheck mv-based bS when iterating between edges
6357 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6358 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6359 // how often to recheck mv-based bS when iterating along each edge
6360 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6362 if (first_vertical_edge_done) {
6363 start = 1;
6364 first_vertical_edge_done = 0;
6367 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6368 start = 1;
6370 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6371 && !IS_INTERLACED(mb_type)
6372 && IS_INTERLACED(mbm_type)
6374 // This is a special case in the standard where the filtering must
6375 // be done twice (once for each field) even if we are in a
6376 // frame macroblock.
6378 static const int nnz_idx[4] = {4,5,6,3};
6379 unsigned int tmp_linesize = 2 * linesize;
6380 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6381 int mbn_xy = mb_xy - 2 * s->mb_stride;
6382 int qp;
6383 int i, j;
6384 int16_t bS[4];
6386 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6387 if( IS_INTRA(mb_type) ||
6388 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6389 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6390 } else {
6391 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6392 for( i = 0; i < 4; i++ ) {
6393 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6394 mbn_nnz[nnz_idx[i]] != 0 )
6395 bS[i] = 2;
6396 else
6397 bS[i] = 1;
6400 // Do not use s->qscale as the luma quantizer because it does not have
6401 // the same value in IPCM macroblocks.
6402 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6403 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6404 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6405 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6406 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6407 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6408 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6409 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6412 start = 1;
6415 /* Calculate bS */
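/* bS per the standard: 4 on the external edge of an intra macroblock (3 for
 * horizontal edges in field/MBAFF coding), 3 on its internal edges, 2 if
 * either side of the edge has nonzero residual coefficients, 1 if the two
 * sides use different reference frames or their motion vectors differ by at
 * least 4 (quarter-pel units) horizontally or mvy_limit vertically, and 0
 * otherwise, in which case the edge is left unfiltered. */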
6416 for( edge = start; edge < edges; edge++ ) {
6417 /* mbn_xy: neighbor macroblock */
6418 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6419 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6420 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6421 int16_t bS[4];
6422 int qp;
6424 if( (edge&1) && IS_8x8DCT(mb_type) )
6425 continue;
6427 if( IS_INTRA(mb_type) ||
6428 IS_INTRA(mbn_type) ) {
6429 int value;
6430 if (edge == 0) {
6431 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6432 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6434 value = 4;
6435 } else {
6436 value = 3;
6438 } else {
6439 value = 3;
6441 bS[0] = bS[1] = bS[2] = bS[3] = value;
6442 } else {
6443 int i, l;
6444 int mv_done;
6446 if( edge & mask_edge ) {
6447 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6448 mv_done = 1;
6450 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6451 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6452 mv_done = 1;
6454 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6455 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6456 int bn_idx= b_idx - (dir ? 8:1);
6457 int v = 0;
6459 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6460 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6461 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6462 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6465 if(h->slice_type_nos == FF_B_TYPE && v){
6466 v=0;
6467 for( l = 0; !v && l < 2; l++ ) {
6468 int ln= 1-l;
6469 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6470 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6471 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6475 bS[0] = bS[1] = bS[2] = bS[3] = v;
6476 mv_done = 1;
6478 else
6479 mv_done = 0;
6481 for( i = 0; i < 4; i++ ) {
6482 int x = dir == 0 ? edge : i;
6483 int y = dir == 0 ? i : edge;
6484 int b_idx= 8 + 4 + x + 8*y;
6485 int bn_idx= b_idx - (dir ? 8:1);
6487 if( h->non_zero_count_cache[b_idx] != 0 ||
6488 h->non_zero_count_cache[bn_idx] != 0 ) {
6489 bS[i] = 2;
6491 else if(!mv_done)
6493 bS[i] = 0;
6494 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6495 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6496 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6497 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6498 bS[i] = 1;
6499 break;
6503 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6504 bS[i] = 0;
6505 for( l = 0; l < 2; l++ ) {
6506 int ln= 1-l;
6507 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6508 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6509 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6510 bS[i] = 1;
6511 break;
6518 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6519 continue;
6522 /* Filter edge */
6523 // Do not use s->qscale as the luma quantizer because it does not have
6524 // the same value in IPCM macroblocks.
6525 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6526 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6527 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6528 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6529 if( dir == 0 ) {
6530 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6531 if( (edge&1) == 0 ) {
6532 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6533 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6534 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6535 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6537 } else {
6538 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6539 if( (edge&1) == 0 ) {
6540 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6541 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6542 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6543 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6550 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6551 H264Context *h = *(void**)arg;
6552 MpegEncContext * const s = &h->s;
6553 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6555 s->mb_skip_run= -1;
6557 if( h->pps.cabac ) {
6558 int i;
6560 /* realign */
6561 align_get_bits( &s->gb );
6563 /* init cabac */
6564 ff_init_cabac_states( &h->cabac);
6565 ff_init_cabac_decoder( &h->cabac,
6566 s->gb.buffer + get_bits_count(&s->gb)/8,
6567 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6568 /* calculate pre-state */
6569 for( i= 0; i < 460; i++ ) {
6570 int pre;
6571 if( h->slice_type_nos == FF_I_TYPE )
6572 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6573 else
6574 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6576 if( pre <= 63 )
6577 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6578 else
6579 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
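/* Standard CABAC context initialisation: pre is the clipped preCtxState in
 * [1,126]; values <= 63 give MPS 0 with state 63-pre, values >= 64 give MPS
 * 1 with state pre-64, stored here packed as 2*state + MPS. */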
6582 for(;;){
6583 //START_TIMER
6584 int ret = decode_mb_cabac(h);
6585 int eos;
6586 //STOP_TIMER("decode_mb_cabac")
6588 if(ret>=0) hl_decode_mb(h);
6590 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6591 s->mb_y++;
6593 if(ret>=0) ret = decode_mb_cabac(h);
6595 if(ret>=0) hl_decode_mb(h);
6596 s->mb_y--;
6598 eos = get_cabac_terminate( &h->cabac );
6600 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6601 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6602 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6603 return -1;
6606 if( ++s->mb_x >= s->mb_width ) {
6607 s->mb_x = 0;
6608 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6609 ++s->mb_y;
6610 if(FIELD_OR_MBAFF_PICTURE) {
6611 ++s->mb_y;
6615 if( eos || s->mb_y >= s->mb_height ) {
6616 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6617 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6618 return 0;
6622 } else {
6623 for(;;){
6624 int ret = decode_mb_cavlc(h);
6626 if(ret>=0) hl_decode_mb(h);
6628 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6629 s->mb_y++;
6630 ret = decode_mb_cavlc(h);
6632 if(ret>=0) hl_decode_mb(h);
6633 s->mb_y--;
6636 if(ret<0){
6637 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6638 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6640 return -1;
6643 if(++s->mb_x >= s->mb_width){
6644 s->mb_x=0;
6645 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6646 ++s->mb_y;
6647 if(FIELD_OR_MBAFF_PICTURE) {
6648 ++s->mb_y;
6650 if(s->mb_y >= s->mb_height){
6651 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6653 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6654 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6656 return 0;
6657 }else{
6658 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6660 return -1;
6665 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6666 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6667 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6668 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6670 return 0;
6671 }else{
6672 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6674 return -1;
6680 #if 0
6681 for(;s->mb_y < s->mb_height; s->mb_y++){
6682 for(;s->mb_x < s->mb_width; s->mb_x++){
6683 int ret= decode_mb(h);
6685 hl_decode_mb(h);
6687 if(ret<0){
6688 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6689 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6691 return -1;
6694 if(++s->mb_x >= s->mb_width){
6695 s->mb_x=0;
6696 if(++s->mb_y >= s->mb_height){
6697 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6698 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6700 return 0;
6701 }else{
6702 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6704 return -1;
6709 if(get_bits_count(s->gb) >= s->gb.size_in_bits){
6710 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6711 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6713 return 0;
6714 }else{
6715 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6717 return -1;
6721 s->mb_x=0;
6722 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6724 #endif
6725 return -1; //not reached
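/**
 * Decode a picture timing SEI message: skip the CPB/DPB removal delays when
 * HRD parameters are present and, if pic_struct_present_flag is set, read
 * pic_struct (kept in h->sei_pic_struct) and skip the optional clock
 * timestamps.
 */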
6728 static int decode_picture_timing(H264Context *h){
6729 MpegEncContext * const s = &h->s;
6730 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6731 skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6732 skip_bits(&s->gb, h->sps.dpb_output_delay_length); /* dpb_output_delay */
6734 if(h->sps.pic_struct_present_flag){
6735 unsigned int i, num_clock_ts;
6736 h->sei_pic_struct = get_bits(&s->gb, 4);
6738 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6739 return -1;
6741 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6743 for (i = 0 ; i < num_clock_ts ; i++){
6744 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6745 unsigned int full_timestamp_flag;
6746 skip_bits(&s->gb, 2); /* ct_type */
6747 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6748 skip_bits(&s->gb, 5); /* counting_type */
6749 full_timestamp_flag = get_bits(&s->gb, 1);
6750 skip_bits(&s->gb, 1); /* discontinuity_flag */
6751 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6752 skip_bits(&s->gb, 8); /* n_frames */
6753 if(full_timestamp_flag){
6754 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6755 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6756 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6757 }else{
6758 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6759 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6760 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6761 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6762 if(get_bits(&s->gb, 1)) /* hours_flag */
6763 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6767 if(h->sps.time_offset_length > 0)
6768 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6772 return 0;
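/**
 * Decode an unregistered user data SEI message. The only payload recognized
 * is the version banner written by the x264 encoder; its build number is
 * stored in h->x264_build so that known encoder quirks can be handled later.
 */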
6775 static int decode_unregistered_user_data(H264Context *h, int size){
6776 MpegEncContext * const s = &h->s;
6777 uint8_t user_data[16+256];
6778 int e, build, i;
6780 if(size<16)
6781 return -1;
6783 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6784 user_data[i]= get_bits(&s->gb, 8);
6787 user_data[i]= 0;
6788 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6789 if(e==1 && build>=0)
6790 h->x264_build= build;
6792 if(s->avctx->debug & FF_DEBUG_BUGS)
6793 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6795 for(; i<size; i++)
6796 skip_bits(&s->gb, 8);
6798 return 0;
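/**
 * Decode SEI NAL units. Payload type and payload size both use the generic
 * SEI coding: every 0xFF byte adds 255 and the first non-0xFF byte terminates
 * the value. Unrecognized payload types are skipped.
 */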
6801 static int decode_sei(H264Context *h){
6802 MpegEncContext * const s = &h->s;
6804 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6805 int size, type;
6807 type=0;
6809 type+= show_bits(&s->gb, 8);
6810 }while(get_bits(&s->gb, 8) == 255);
6812 size=0;
6814 size+= show_bits(&s->gb, 8);
6815 }while(get_bits(&s->gb, 8) == 255);
6817 switch(type){
6818 case 1: // Picture timing SEI
6819 if(decode_picture_timing(h) < 0)
6820 return -1;
6821 break;
6822 case 5:
6823 if(decode_unregistered_user_data(h, size) < 0)
6824 return -1;
6825 break;
6826 default:
6827 skip_bits(&s->gb, 8*size);
6830 //FIXME check bits here
6831 align_get_bits(&s->gb);
6834 return 0;
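/**
 * Decode the HRD (hypothetical reference decoder) parameters from the VUI.
 * Most fields are only skipped; the removal/output delay lengths and
 * time_offset_length are stored because the picture timing SEI parser needs
 * them.
 */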
6837 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6838 MpegEncContext * const s = &h->s;
6839 int cpb_count, i;
6840 cpb_count = get_ue_golomb(&s->gb) + 1;
6842 if(cpb_count > 32U){
6843 av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6844 return -1;
6847 get_bits(&s->gb, 4); /* bit_rate_scale */
6848 get_bits(&s->gb, 4); /* cpb_size_scale */
6849 for(i=0; i<cpb_count; i++){
6850 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6851 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6852 get_bits1(&s->gb); /* cbr_flag */
6854 get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6855 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6856 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6857 sps->time_offset_length = get_bits(&s->gb, 5);
6858 return 0;
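/**
 * Decode the VUI (video usability information) part of the SPS: sample aspect
 * ratio, timing information, HRD parameters and the bitstream restriction
 * fields. Fields the decoder does not use are parsed and discarded.
 */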
6861 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6862 MpegEncContext * const s = &h->s;
6863 int aspect_ratio_info_present_flag;
6864 unsigned int aspect_ratio_idc;
6866 aspect_ratio_info_present_flag= get_bits1(&s->gb);
6868 if( aspect_ratio_info_present_flag ) {
6869 aspect_ratio_idc= get_bits(&s->gb, 8);
6870 if( aspect_ratio_idc == EXTENDED_SAR ) {
6871 sps->sar.num= get_bits(&s->gb, 16);
6872 sps->sar.den= get_bits(&s->gb, 16);
6873 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6874 sps->sar= pixel_aspect[aspect_ratio_idc];
6875 }else{
6876 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6877 return -1;
6879 }else{
6880 sps->sar.num=
6881 sps->sar.den= 0;
6883 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6885 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
6886 get_bits1(&s->gb); /* overscan_appropriate_flag */
6889 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
6890 get_bits(&s->gb, 3); /* video_format */
6891 get_bits1(&s->gb); /* video_full_range_flag */
6892 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
6893 get_bits(&s->gb, 8); /* colour_primaries */
6894 get_bits(&s->gb, 8); /* transfer_characteristics */
6895 get_bits(&s->gb, 8); /* matrix_coefficients */
6899 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
6900 get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
6901 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
6904 sps->timing_info_present_flag = get_bits1(&s->gb);
6905 if(sps->timing_info_present_flag){
6906 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6907 sps->time_scale = get_bits_long(&s->gb, 32);
6908 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6911 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6912 if(sps->nal_hrd_parameters_present_flag)
6913 if(decode_hrd_parameters(h, sps) < 0)
6914 return -1;
6915 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6916 if(sps->vcl_hrd_parameters_present_flag)
6917 if(decode_hrd_parameters(h, sps) < 0)
6918 return -1;
6919 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6920 get_bits1(&s->gb); /* low_delay_hrd_flag */
6921 sps->pic_struct_present_flag = get_bits1(&s->gb);
6923 sps->bitstream_restriction_flag = get_bits1(&s->gb);
6924 if(sps->bitstream_restriction_flag){
6925 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
6926 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6927 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6928 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6929 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6930 sps->num_reorder_frames= get_ue_golomb(&s->gb);
6931 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6933 if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6934 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6935 return -1;
6939 return 0;
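/**
 * Decode a single quantization scaling list. Coefficients are coded as signed
 * deltas from the previous value (starting at 8) in zigzag order; a first
 * delta yielding 0 selects the JVT default matrix, and a later 0 repeats the
 * last value for all remaining coefficients. If the list is absent, the
 * fallback matrix is copied instead.
 */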
6942 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6943 const uint8_t *jvt_list, const uint8_t *fallback_list){
6944 MpegEncContext * const s = &h->s;
6945 int i, last = 8, next = 8;
6946 const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6947 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6948 memcpy(factors, fallback_list, size*sizeof(uint8_t));
6949 else
6950 for(i=0;i<size;i++){
6951 if(next)
6952 next = (last + get_se_golomb(&s->gb)) & 0xff;
6953 if(!i && !next){ /* matrix not written, we use the preset one */
6954 memcpy(factors, jvt_list, size*sizeof(uint8_t));
6955 break;
6957 last = factors[scan[i]] = next ? next : last;
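/**
 * Decode the full set of scaling matrices: six 4x4 lists and, when 8x8
 * transforms are possible, two 8x8 lists. Absent lists fall back to the
 * previously decoded list, to the SPS matrices (when parsing a PPS), or to
 * the JVT defaults.
 */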
6961 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6962 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6963 MpegEncContext * const s = &h->s;
6964 int fallback_sps = !is_sps && sps->scaling_matrix_present;
6965 const uint8_t *fallback[4] = {
6966 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6967 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6968 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6969 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6971 if(get_bits1(&s->gb)){
6972 sps->scaling_matrix_present |= is_sps;
6973 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6974 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6975 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6976 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6977 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6978 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6979 if(is_sps || pps->transform_8x8_mode){
6980 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
6981 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
6986 static inline int decode_seq_parameter_set(H264Context *h){
6987 MpegEncContext * const s = &h->s;
6988 int profile_idc, level_idc;
6989 unsigned int sps_id;
6990 int i;
6991 SPS *sps;
6993 profile_idc= get_bits(&s->gb, 8);
6994 get_bits1(&s->gb); //constraint_set0_flag
6995 get_bits1(&s->gb); //constraint_set1_flag
6996 get_bits1(&s->gb); //constraint_set2_flag
6997 get_bits1(&s->gb); //constraint_set3_flag
6998 get_bits(&s->gb, 4); // reserved
6999 level_idc= get_bits(&s->gb, 8);
7000 sps_id= get_ue_golomb(&s->gb);
7002 if(sps_id >= MAX_SPS_COUNT) {
7003 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7004 return -1;
7006 sps= av_mallocz(sizeof(SPS));
7007 if(sps == NULL)
7008 return -1;
7010 sps->profile_idc= profile_idc;
7011 sps->level_idc= level_idc;
7013 memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7014 memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7015 sps->scaling_matrix_present = 0;
7017 if(sps->profile_idc >= 100){ //high profile
7018 sps->chroma_format_idc= get_ue_golomb(&s->gb);
7019 if(sps->chroma_format_idc == 3)
7020 get_bits1(&s->gb); //residual_color_transform_flag
7021 get_ue_golomb(&s->gb); //bit_depth_luma_minus8
7022 get_ue_golomb(&s->gb); //bit_depth_chroma_minus8
7023 sps->transform_bypass = get_bits1(&s->gb);
7024 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7025 }else{
7026 sps->chroma_format_idc= 1;
7029 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7030 sps->poc_type= get_ue_golomb(&s->gb);
7032 if(sps->poc_type == 0){ //FIXME #define
7033 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7034 } else if(sps->poc_type == 1){//FIXME #define
7035 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7036 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7037 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7038 sps->poc_cycle_length = get_ue_golomb(&s->gb);
7040 if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7041 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7042 goto fail;
7045 for(i=0; i<sps->poc_cycle_length; i++)
7046 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7047 }else if(sps->poc_type != 2){
7048 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7049 goto fail;
7052 sps->ref_frame_count= get_ue_golomb(&s->gb);
7053 if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7054 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7055 goto fail;
7057 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7058 sps->mb_width = get_ue_golomb(&s->gb) + 1;
7059 sps->mb_height= get_ue_golomb(&s->gb) + 1;
7060 if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7061 avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7062 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7063 goto fail;
7066 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7067 if(!sps->frame_mbs_only_flag)
7068 sps->mb_aff= get_bits1(&s->gb);
7069 else
7070 sps->mb_aff= 0;
7072 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7074 #ifndef ALLOW_INTERLACE
7075 if(sps->mb_aff)
7076 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7077 #endif
7078 sps->crop= get_bits1(&s->gb);
7079 if(sps->crop){
7080 sps->crop_left = get_ue_golomb(&s->gb);
7081 sps->crop_right = get_ue_golomb(&s->gb);
7082 sps->crop_top = get_ue_golomb(&s->gb);
7083 sps->crop_bottom= get_ue_golomb(&s->gb);
7084 if(sps->crop_left || sps->crop_top){
7085 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7087 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7088 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7090 }else{
7091 sps->crop_left =
7092 sps->crop_right =
7093 sps->crop_top =
7094 sps->crop_bottom= 0;
7097 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7098 if( sps->vui_parameters_present_flag )
7099 decode_vui_parameters(h, sps);
7101 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7102 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7103 sps_id, sps->profile_idc, sps->level_idc,
7104 sps->poc_type,
7105 sps->ref_frame_count,
7106 sps->mb_width, sps->mb_height,
7107 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7108 sps->direct_8x8_inference_flag ? "8B8" : "",
7109 sps->crop_left, sps->crop_right,
7110 sps->crop_top, sps->crop_bottom,
7111 sps->vui_parameters_present_flag ? "VUI" : "",
7112 ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7115 av_free(h->sps_buffers[sps_id]);
7116 h->sps_buffers[sps_id]= sps;
7117 return 0;
7118 fail:
7119 av_free(sps);
7120 return -1;
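/**
 * Precompute the chroma QP lookup table for one chroma_qp_index_offset: for
 * every luma QP in 0..51 the offset is applied, the result clipped to the
 * valid range, and the corresponding chroma QP stored.
 */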
7123 static void
7124 build_qp_table(PPS *pps, int t, int index)
7126 int i;
7127 for(i = 0; i < 52; i++)
7128 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
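/**
 * Decode a picture parameter set. FMO is not supported: when
 * slice_group_count > 1 the slice group map syntax is not parsed. The scaling
 * matrices default to those of the referenced SPS and are only overridden
 * when the PPS carries its own.
 */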
7131 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7132 MpegEncContext * const s = &h->s;
7133 unsigned int pps_id= get_ue_golomb(&s->gb);
7134 PPS *pps;
7136 if(pps_id >= MAX_PPS_COUNT) {
7137 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7138 return -1;
7141 pps= av_mallocz(sizeof(PPS));
7142 if(pps == NULL)
7143 return -1;
7144 pps->sps_id= get_ue_golomb(&s->gb);
7145 if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7146 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7147 goto fail;
7150 pps->cabac= get_bits1(&s->gb);
7151 pps->pic_order_present= get_bits1(&s->gb);
7152 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7153 if(pps->slice_group_count > 1 ){
7154 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7155 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7156 switch(pps->mb_slice_group_map_type){
7157 case 0:
7158 #if 0
7159 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7160 | run_length[ i ] |1 |ue(v) |
7161 #endif
7162 break;
7163 case 2:
7164 #if 0
7165 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7166 |{ | | |
7167 | top_left_mb[ i ] |1 |ue(v) |
7168 | bottom_right_mb[ i ] |1 |ue(v) |
7169 | } | | |
7170 #endif
7171 break;
7172 case 3:
7173 case 4:
7174 case 5:
7175 #if 0
7176 | slice_group_change_direction_flag |1 |u(1) |
7177 | slice_group_change_rate_minus1 |1 |ue(v) |
7178 #endif
7179 break;
7180 case 6:
7181 #if 0
7182 | slice_group_id_cnt_minus1 |1 |ue(v) |
7183 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7184 |) | | |
7185 | slice_group_id[ i ] |1 |u(v) |
7186 #endif
7187 break;
7190 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7191 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7192 if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7193 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7194 goto fail;
7197 pps->weighted_pred= get_bits1(&s->gb);
7198 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7199 pps->init_qp= get_se_golomb(&s->gb) + 26;
7200 pps->init_qs= get_se_golomb(&s->gb) + 26;
7201 pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7202 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7203 pps->constrained_intra_pred= get_bits1(&s->gb);
7204 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7206 pps->transform_8x8_mode= 0;
7207 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7208 memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7209 memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7211 if(get_bits_count(&s->gb) < bit_length){
7212 pps->transform_8x8_mode= get_bits1(&s->gb);
7213 decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7214 pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7215 } else {
7216 pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7219 build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7220 build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7221 if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7222 h->pps.chroma_qp_diff= 1;
7224 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7225 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7226 pps_id, pps->sps_id,
7227 pps->cabac ? "CABAC" : "CAVLC",
7228 pps->slice_group_count,
7229 pps->ref_count[0], pps->ref_count[1],
7230 pps->weighted_pred ? "weighted" : "",
7231 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7232 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7233 pps->constrained_intra_pred ? "CONSTR" : "",
7234 pps->redundant_pic_cnt_present ? "REDU" : "",
7235 pps->transform_8x8_mode ? "8x8DCT" : ""
7239 av_free(h->pps_buffers[pps_id]);
7240 h->pps_buffers[pps_id]= pps;
7241 return 0;
7242 fail:
7243 av_free(pps);
7244 return -1;
7248 * Call decode_slice() for each context.
7250 * @param h h264 master context
7251 * @param context_count number of contexts to execute
7253 static void execute_decode_slices(H264Context *h, int context_count){
7254 MpegEncContext * const s = &h->s;
7255 AVCodecContext * const avctx= s->avctx;
7256 H264Context *hx;
7257 int i;
7259 if(context_count == 1) {
7260 decode_slice(avctx, &h);
7261 } else {
7262 for(i = 1; i < context_count; i++) {
7263 hx = h->thread_context[i];
7264 hx->s.error_recognition = avctx->error_recognition;
7265 hx->s.error_count = 0;
7268 avctx->execute(avctx, (void *)decode_slice,
7269 (void **)h->thread_context, NULL, context_count, sizeof(void*));
7271 /* pull back stuff from slices to master context */
7272 hx = h->thread_context[context_count - 1];
7273 s->mb_x = hx->s.mb_x;
7274 s->mb_y = hx->s.mb_y;
7275 s->dropable = hx->s.dropable;
7276 s->picture_structure = hx->s.picture_structure;
7277 for(i = 1; i < context_count; i++)
7278 h->s.error_count += h->thread_context[i]->s.error_count;
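/**
 * Split the input buffer into NAL units and dispatch them. In avcC mode each
 * NAL unit is prefixed by its size; otherwise the 00 00 01 Annex B start code
 * is searched for. Slice NAL units are distributed over the available thread
 * contexts and decoded in batches of h->max_contexts via
 * execute_decode_slices().
 */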
7283 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7284 MpegEncContext * const s = &h->s;
7285 AVCodecContext * const avctx= s->avctx;
7286 int buf_index=0;
7287 H264Context *hx; ///< thread context
7288 int context_count = 0;
7290 h->max_contexts = avctx->thread_count;
7291 #if 0
7292 int i;
7293 for(i=0; i<50; i++){
7294 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7296 #endif
7297 if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7298 h->current_slice = 0;
7299 if (!s->first_field)
7300 s->current_picture_ptr= NULL;
7303 for(;;){
7304 int consumed;
7305 int dst_length;
7306 int bit_length;
7307 const uint8_t *ptr;
7308 int i, nalsize = 0;
7309 int err;
7311 if(h->is_avc) {
7312 if(buf_index >= buf_size) break;
7313 nalsize = 0;
7314 for(i = 0; i < h->nal_length_size; i++)
7315 nalsize = (nalsize << 8) | buf[buf_index++];
7316 if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7317 if(nalsize == 1){
7318 buf_index++;
7319 continue;
7320 }else{
7321 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7322 break;
7325 } else {
7326 // start code prefix search
7327 for(; buf_index + 3 < buf_size; buf_index++){
7328 // This should always succeed in the first iteration.
7329 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7330 break;
7333 if(buf_index+3 >= buf_size) break;
7335 buf_index+=3;
7338 hx = h->thread_context[context_count];
7340 ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7341 if (ptr==NULL || dst_length < 0){
7342 return -1;
7344 while(ptr[dst_length - 1] == 0 && dst_length > 0)
7345 dst_length--;
7346 bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7348 if(s->avctx->debug&FF_DEBUG_STARTCODE){
7349 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7352 if (h->is_avc && (nalsize != consumed)){
7353 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7354 consumed= nalsize;
7357 buf_index += consumed;
7359 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
7360 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7361 continue;
7363 again:
7364 err = 0;
7365 switch(hx->nal_unit_type){
7366 case NAL_IDR_SLICE:
7367 if (h->nal_unit_type != NAL_IDR_SLICE) {
7368 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7369 return -1;
7371 idr(h); //FIXME ensure we don't lose some frames if there is reordering
7372 case NAL_SLICE:
7373 init_get_bits(&hx->s.gb, ptr, bit_length);
7374 hx->intra_gb_ptr=
7375 hx->inter_gb_ptr= &hx->s.gb;
7376 hx->s.data_partitioning = 0;
7378 if((err = decode_slice_header(hx, h)))
7379 break;
7381 s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7382 if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7383 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7384 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7385 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7386 && avctx->skip_frame < AVDISCARD_ALL)
7387 context_count++;
7388 break;
7389 case NAL_DPA:
7390 init_get_bits(&hx->s.gb, ptr, bit_length);
7391 hx->intra_gb_ptr=
7392 hx->inter_gb_ptr= NULL;
7393 hx->s.data_partitioning = 1;
7395 err = decode_slice_header(hx, h);
7396 break;
7397 case NAL_DPB:
7398 init_get_bits(&hx->intra_gb, ptr, bit_length);
7399 hx->intra_gb_ptr= &hx->intra_gb;
7400 break;
7401 case NAL_DPC:
7402 init_get_bits(&hx->inter_gb, ptr, bit_length);
7403 hx->inter_gb_ptr= &hx->inter_gb;
7405 if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7406 && s->context_initialized
7407 && s->hurry_up < 5
7408 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7409 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7410 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7411 && avctx->skip_frame < AVDISCARD_ALL)
7412 context_count++;
7413 break;
7414 case NAL_SEI:
7415 init_get_bits(&s->gb, ptr, bit_length);
7416 decode_sei(h);
7417 break;
7418 case NAL_SPS:
7419 init_get_bits(&s->gb, ptr, bit_length);
7420 decode_seq_parameter_set(h);
7422 if(s->flags& CODEC_FLAG_LOW_DELAY)
7423 s->low_delay=1;
7425 if(avctx->has_b_frames < 2)
7426 avctx->has_b_frames= !s->low_delay;
7427 break;
7428 case NAL_PPS:
7429 init_get_bits(&s->gb, ptr, bit_length);
7431 decode_picture_parameter_set(h, bit_length);
7433 break;
7434 case NAL_AUD:
7435 case NAL_END_SEQUENCE:
7436 case NAL_END_STREAM:
7437 case NAL_FILLER_DATA:
7438 case NAL_SPS_EXT:
7439 case NAL_AUXILIARY_SLICE:
7440 break;
7441 default:
7442 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7445 if(context_count == h->max_contexts) {
7446 execute_decode_slices(h, context_count);
7447 context_count = 0;
7450 if (err < 0)
7451 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7452 else if(err == 1) {
7453 /* Slice could not be decoded in parallel mode, copy down
7454 * NAL unit stuff to context 0 and restart. Note that
7455 * rbsp_buffer is not transferred, but since we no longer
7456 * run in parallel mode this should not be an issue. */
7457 h->nal_unit_type = hx->nal_unit_type;
7458 h->nal_ref_idc = hx->nal_ref_idc;
7459 hx = h;
7460 goto again;
7463 if(context_count)
7464 execute_decode_slices(h, context_count);
7465 return buf_index;
7469 * Returns the number of bytes consumed for building the current frame.
7471 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7472 if(pos==0) pos=1; //avoid infinite loops (probably not needed, but just in case)
7473 if(pos+10>buf_size) pos=buf_size; //if fewer than about 10 bytes remain, consume the whole buffer
7475 return pos;
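/**
 * Top-level decode callback. A zero-sized packet flushes the delayed picture
 * buffer; otherwise any avcC extradata is parsed once, the NAL units of the
 * packet are decoded, and completed pictures are reordered by POC before one
 * is returned.
 */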
7478 static int decode_frame(AVCodecContext *avctx,
7479 void *data, int *data_size,
7480 const uint8_t *buf, int buf_size)
7482 H264Context *h = avctx->priv_data;
7483 MpegEncContext *s = &h->s;
7484 AVFrame *pict = data;
7485 int buf_index;
7487 s->flags= avctx->flags;
7488 s->flags2= avctx->flags2;
7490 /* end of stream, output what is still in the buffers */
7491 if (buf_size == 0) {
7492 Picture *out;
7493 int i, out_idx;
7495 //FIXME factorize this with the output code below
7496 out = h->delayed_pic[0];
7497 out_idx = 0;
7498 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7499 if(h->delayed_pic[i]->poc < out->poc){
7500 out = h->delayed_pic[i];
7501 out_idx = i;
7504 for(i=out_idx; h->delayed_pic[i]; i++)
7505 h->delayed_pic[i] = h->delayed_pic[i+1];
7507 if(out){
7508 *data_size = sizeof(AVFrame);
7509 *pict= *(AVFrame*)out;
7512 return 0;
7515 if(h->is_avc && !h->got_avcC) {
7516 int i, cnt, nalsize;
7517 unsigned char *p = avctx->extradata;
7518 if(avctx->extradata_size < 7) {
7519 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7520 return -1;
7522 if(*p != 1) {
7523 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7524 return -1;
7526 /* The SPS and PPS in the avcC always have their length coded on 2 bytes,
7527 so use a fake nal_length_size = 2 while parsing them. */
7528 h->nal_length_size = 2;
7529 // Decode sps from avcC
7530 cnt = *(p+5) & 0x1f; // Number of sps
7531 p += 6;
7532 for (i = 0; i < cnt; i++) {
7533 nalsize = AV_RB16(p) + 2;
7534 if(decode_nal_units(h, p, nalsize) < 0) {
7535 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7536 return -1;
7538 p += nalsize;
7540 // Decode pps from avcC
7541 cnt = *(p++); // Number of pps
7542 for (i = 0; i < cnt; i++) {
7543 nalsize = AV_RB16(p) + 2;
7544 if(decode_nal_units(h, p, nalsize) != nalsize) {
7545 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7546 return -1;
7548 p += nalsize;
7550 // Now store the correct NAL length size, which will be used to parse all remaining NALs
7551 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7552 // Do not reparse avcC
7553 h->got_avcC = 1;
7556 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7557 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7558 return -1;
7559 h->got_avcC = 1;
7562 buf_index=decode_nal_units(h, buf, buf_size);
7563 if(buf_index < 0)
7564 return -1;
7566 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7567 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7568 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7569 return -1;
7572 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7573 Picture *out = s->current_picture_ptr;
7574 Picture *cur = s->current_picture_ptr;
7575 int i, pics, cross_idr, out_of_order, out_idx;
7577 s->mb_y= 0;
7579 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7580 s->current_picture_ptr->pict_type= s->pict_type;
7582 if(!s->dropable) {
7583 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7584 h->prev_poc_msb= h->poc_msb;
7585 h->prev_poc_lsb= h->poc_lsb;
7587 h->prev_frame_num_offset= h->frame_num_offset;
7588 h->prev_frame_num= h->frame_num;
7591 * FIXME: The error handling code does not seem to support interlaced video
7592 * when slices span multiple rows.
7593 * The ff_er_add_slice calls don't work right for bottom
7594 * fields; they cause massive erroneous error concealment.
7595 * Error marking covers both fields (top and bottom),
7596 * which causes a mismatched s->error_count
7597 * and a bad error table. Further, the error count goes to
7598 * INT_MAX when called for the bottom field, because mb_y is
7599 * one past the end (the caller's fault), and resync_mb_y != 0
7600 * causes problems for the first MB line, too.
7602 if (!FIELD_PICTURE)
7603 ff_er_frame_end(s);
7605 MPV_frame_end(s);
7607 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7608 /* Wait for second field. */
7609 *data_size = 0;
7611 } else {
7612 cur->repeat_pict = 0;
7614 /* Signal interlacing information externally. */
7615 /* Prefer picture timing SEI information, when present, over what the decoding process implies. */
7616 if(h->sps.pic_struct_present_flag){
7617 switch (h->sei_pic_struct)
7619 case SEI_PIC_STRUCT_FRAME:
7620 cur->interlaced_frame = 0;
7621 break;
7622 case SEI_PIC_STRUCT_TOP_FIELD:
7623 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7624 case SEI_PIC_STRUCT_TOP_BOTTOM:
7625 case SEI_PIC_STRUCT_BOTTOM_TOP:
7626 cur->interlaced_frame = 1;
7627 break;
7628 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7629 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7630 // Signal the possibility of telecined film externally (pic_struct 5,6)
7631 // From these hints, let the application decide whether to apply deinterlacing.
7632 cur->repeat_pict = 1;
7633 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7634 break;
7635 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7636 // Force progressive here, as doubling interlaced frame is a bad idea.
7637 cur->interlaced_frame = 0;
7638 cur->repeat_pict = 2;
7639 break;
7640 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7641 cur->interlaced_frame = 0;
7642 cur->repeat_pict = 4;
7643 break;
7645 }else{
7646 /* Derive interlacing flag from used decoding process. */
7647 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7650 if (cur->field_poc[0] != cur->field_poc[1]){
7651 /* Derive top_field_first from field pocs. */
7652 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7653 }else{
7654 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7655 /* Use picture timing SEI information. Even if it comes from a past frame, it is better than nothing. */
7656 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7657 || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7658 cur->top_field_first = 1;
7659 else
7660 cur->top_field_first = 0;
7661 }else{
7662 /* Most likely progressive */
7663 cur->top_field_first = 0;
7667 //FIXME do something with unavailable reference frames
7669 /* Sort B-frames into display order */
7671 if(h->sps.bitstream_restriction_flag
7672 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7673 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7674 s->low_delay = 0;
7677 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7678 && !h->sps.bitstream_restriction_flag){
7679 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7680 s->low_delay= 0;
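/* Output reordering: append the current picture to delayed_pic[], pick the
 * entry with the smallest POC (scanning up to the first key frame) as the
 * output candidate, and grow has_b_frames adaptively when pictures arrive
 * out of order. */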
7683 pics = 0;
7684 while(h->delayed_pic[pics]) pics++;
7686 assert(pics <= MAX_DELAYED_PIC_COUNT);
7688 h->delayed_pic[pics++] = cur;
7689 if(cur->reference == 0)
7690 cur->reference = DELAYED_PIC_REF;
7692 out = h->delayed_pic[0];
7693 out_idx = 0;
7694 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7695 if(h->delayed_pic[i]->poc < out->poc){
7696 out = h->delayed_pic[i];
7697 out_idx = i;
7699 cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7701 out_of_order = !cross_idr && out->poc < h->outputed_poc;
7703 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7705 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7706 || (s->low_delay &&
7707 ((!cross_idr && out->poc > h->outputed_poc + 2)
7708 || cur->pict_type == FF_B_TYPE)))
7710 s->low_delay = 0;
7711 s->avctx->has_b_frames++;
7714 if(out_of_order || pics > s->avctx->has_b_frames){
7715 out->reference &= ~DELAYED_PIC_REF;
7716 for(i=out_idx; h->delayed_pic[i]; i++)
7717 h->delayed_pic[i] = h->delayed_pic[i+1];
7719 if(!out_of_order && pics > s->avctx->has_b_frames){
7720 *data_size = sizeof(AVFrame);
7722 h->outputed_poc = out->poc;
7723 *pict= *(AVFrame*)out;
7724 }else{
7725 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7730 assert(pict->data[0] || !*data_size);
7731 ff_print_debug_info(s, pict);
7732 //printf("out %d\n", (int)pict->data[0]);
7733 #if 0 //?
7735 /* Return the Picture timestamp as the frame number. */
7736 /* We subtract 1 because it is added in utils.c. */
7737 avctx->frame_number = s->picture_number - 1;
7738 #endif
7739 return get_consumed_bytes(s, buf_index, buf_size);
7741 #if 0
7742 static inline void fill_mb_avail(H264Context *h){
7743 MpegEncContext * const s = &h->s;
7744 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7746 if(s->mb_y){
7747 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7748 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7749 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7750 }else{
7751 h->mb_avail[0]=
7752 h->mb_avail[1]=
7753 h->mb_avail[2]= 0;
7755 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7756 h->mb_avail[4]= 1; //FIXME move out
7757 h->mb_avail[5]= 0; //FIXME move out
7759 #endif
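/* Stand-alone self-test (built only when TEST is defined): exercises the
 * exp-Golomb reader/writer and, in currently disabled sections, the 4x4
 * transform and the NAL escaping/unescaping code. */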
7761 #ifdef TEST
7762 #undef printf
7763 #undef random
7764 #define COUNT 8000
7765 #define SIZE (COUNT*40)
7766 int main(void){
7767 int i;
7768 uint8_t temp[SIZE];
7769 PutBitContext pb;
7770 GetBitContext gb;
7771 // int int_temp[10000];
7772 DSPContext dsp;
7773 AVCodecContext avctx;
7775 dsputil_init(&dsp, &avctx);
7777 init_put_bits(&pb, temp, SIZE);
7778 printf("testing unsigned exp golomb\n");
7779 for(i=0; i<COUNT; i++){
7780 START_TIMER
7781 set_ue_golomb(&pb, i);
7782 STOP_TIMER("set_ue_golomb");
7784 flush_put_bits(&pb);
7786 init_get_bits(&gb, temp, 8*SIZE);
7787 for(i=0; i<COUNT; i++){
7788 int j, s;
7790 s= show_bits(&gb, 24);
7792 START_TIMER
7793 j= get_ue_golomb(&gb);
7794 if(j != i){
7795 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7796 // return -1;
7798 STOP_TIMER("get_ue_golomb");
7802 init_put_bits(&pb, temp, SIZE);
7803 printf("testing signed exp golomb\n");
7804 for(i=0; i<COUNT; i++){
7805 START_TIMER
7806 set_se_golomb(&pb, i - COUNT/2);
7807 STOP_TIMER("set_se_golomb");
7809 flush_put_bits(&pb);
7811 init_get_bits(&gb, temp, 8*SIZE);
7812 for(i=0; i<COUNT; i++){
7813 int j, s;
7815 s= show_bits(&gb, 24);
7817 START_TIMER
7818 j= get_se_golomb(&gb);
7819 if(j != i - COUNT/2){
7820 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7821 // return -1;
7823 STOP_TIMER("get_se_golomb");
7826 #if 0
7827 printf("testing 4x4 (I)DCT\n");
7829 DCTELEM block[16];
7830 uint8_t src[16], ref[16];
7831 uint64_t error= 0, max_error=0;
7833 for(i=0; i<COUNT; i++){
7834 int j;
7835 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7836 for(j=0; j<16; j++){
7837 ref[j]= random()%255;
7838 src[j]= random()%255;
7841 h264_diff_dct_c(block, src, ref, 4);
7843 //normalize
7844 for(j=0; j<16; j++){
7845 // printf("%d ", block[j]);
7846 block[j]= block[j]*4;
7847 if(j&1) block[j]= (block[j]*4 + 2)/5;
7848 if(j&4) block[j]= (block[j]*4 + 2)/5;
7850 // printf("\n");
7852 s->dsp.h264_idct_add(ref, block, 4);
7853 /* for(j=0; j<16; j++){
7854 printf("%d ", ref[j]);
7856 printf("\n");*/
7858 for(j=0; j<16; j++){
7859 int diff= FFABS(src[j] - ref[j]);
7861 error+= diff*diff;
7862 max_error= FFMAX(max_error, diff);
7865 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7866 printf("testing quantizer\n");
7867 for(qp=0; qp<52; qp++){
7868 for(i=0; i<16; i++)
7869 src1_block[i]= src2_block[i]= random()%255;
7872 printf("Testing NAL layer\n");
7874 uint8_t bitstream[COUNT];
7875 uint8_t nal[COUNT*2];
7876 H264Context h;
7877 memset(&h, 0, sizeof(H264Context));
7879 for(i=0; i<COUNT; i++){
7880 int zeros= i;
7881 int nal_length;
7882 int consumed;
7883 int out_length;
7884 uint8_t *out;
7885 int j;
7887 for(j=0; j<COUNT; j++){
7888 bitstream[j]= (random() % 255) + 1;
7891 for(j=0; j<zeros; j++){
7892 int pos= random() % COUNT;
7893 while(bitstream[pos] == 0){
7894 pos++;
7895 pos %= COUNT;
7897 bitstream[pos]=0;
7900 START_TIMER
7902 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7903 if(nal_length<0){
7904 printf("encoding failed\n");
7905 return -1;
7908 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7910 STOP_TIMER("NAL")
7912 if(out_length != COUNT){
7913 printf("incorrect length %d %d\n", out_length, COUNT);
7914 return -1;
7917 if(consumed != nal_length){
7918 printf("incorrect consumed length %d %d\n", nal_length, consumed);
7919 return -1;
7922 if(memcmp(bitstream, out, COUNT)){
7923 printf("mismatch\n");
7924 return -1;
7927 #endif
7929 printf("Testing RBSP\n");
7932 return 0;
7934 #endif /* TEST */
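/**
 * Free all per-decoder state: the RBSP buffers, the macroblock tables and the
 * cached SPS/PPS parameter sets, then tear down the shared MpegEncContext.
 */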
7937 static av_cold int decode_end(AVCodecContext *avctx)
7939 H264Context *h = avctx->priv_data;
7940 MpegEncContext *s = &h->s;
7941 int i;
7943 av_freep(&h->rbsp_buffer[0]);
7944 av_freep(&h->rbsp_buffer[1]);
7945 free_tables(h); //FIXME cleanup init stuff perhaps
7947 for(i = 0; i < MAX_SPS_COUNT; i++)
7948 av_freep(h->sps_buffers + i);
7950 for(i = 0; i < MAX_PPS_COUNT; i++)
7951 av_freep(h->pps_buffers + i);
7953 MPV_common_end(s);
7955 // memset(h, 0, sizeof(H264Context));
7957 return 0;
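/* Public decoder description registered with libavcodec. The positional
 * fields are: name, media type, codec id, private context size, init,
 * encode (unused), close and decode callbacks, followed by the capability
 * flags. */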
7961 AVCodec h264_decoder = {
7962 "h264",
7963 CODEC_TYPE_VIDEO,
7964 CODEC_ID_H264,
7965 sizeof(H264Context),
7966 decode_init,
7967 NULL,
7968 decode_end,
7969 decode_frame,
7970 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7971 .flush= flush_dpb,
7972 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7975 #include "svq3.c"