Use get_bits_left() instead of size_in_bits - get_bits_count().
[ffmpeg-lucabe.git] / libavcodec / h264.c
blob cf6771aa79e7a42187aba2f43e498595af51a58c
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 /**
23 * @file libavcodec/h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
28 #include "internal.h"
29 #include "dsputil.h"
30 #include "avcodec.h"
31 #include "mpegvideo.h"
32 #include "h264.h"
33 #include "h264data.h"
34 #include "h264_parser.h"
35 #include "golomb.h"
36 #include "mathops.h"
37 #include "rectangle.h"
38 #include "vdpau_internal.h"
40 #include "cabac.h"
41 #if ARCH_X86
42 #include "x86/h264_i386.h"
43 #endif
45 //#undef NDEBUG
46 #include <assert.h>
48 /**
49 * Value of Picture.reference when Picture is not a reference picture, but
50 * is held for delayed output.
52 #define DELAYED_PIC_REF 4
54 static VLC coeff_token_vlc[4];
55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
58 static VLC chroma_dc_coeff_token_vlc;
59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
62 static VLC total_zeros_vlc[15];
63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
64 static const int total_zeros_vlc_tables_size = 512;
66 static VLC chroma_dc_total_zeros_vlc[3];
67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
70 static VLC run_vlc[6];
71 static VLC_TYPE run_vlc_tables[6][8][2];
72 static const int run_vlc_tables_size = 8;
74 static VLC run7_vlc;
75 static VLC_TYPE run7_vlc_table[96][2];
76 static const int run7_vlc_table_size = 96;
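/* The static tables above hold the prebuilt VLC lookup tables used by CAVLC
 * residual decoding: coeff_token (one VLC per nC range), the chroma DC
 * coeff_token, total_zeros, chroma DC total_zeros and the run_before codes.
 * They are filled once at init time and shared by all decoder instances. */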
78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
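/* pack16to32() packs two 16-bit values (e.g. the x/y components of a motion
 * vector, or a pair of reference indices) into one 32-bit word in memory
 * order, so that rows of the mv/ref caches can be read and written with a
 * single 32-bit access; the HAVE_BIGENDIAN branch keeps the byte layout
 * identical on both endiannesses. */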
84 static av_always_inline uint32_t pack16to32(int a, int b){
85 #if HAVE_BIGENDIAN
86 return (b&0xFFFF) + (a<<16);
87 #else
88 return (a&0xFFFF) + (b<<16);
89 #endif
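/* rem6[q] = q % 6 and div6[q] = q / 6 for the valid QP range 0..51: H.264
 * dequantisation selects a scale with qp % 6 and shifts by qp / 6, and these
 * lookup tables avoid doing the division per block. */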
92 static const uint8_t rem6[52]={
93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
96 static const uint8_t div6[52]={
97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
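/* left_block_options[] selects which sub-block indices of the left neighbour
 * macroblock(s) feed the caches: entry 0 is the normal case, the other
 * entries cover the MBAFF cases where the current MB and its left neighbour
 * differ in frame/field coding (see fill_caches() below). */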
100 static const uint8_t left_block_options[4][8]={
101 {0,1,2,3,7,10,8,11},
102 {2,2,3,3,8,11,8,11},
103 {0,0,1,1,7,10,7,10},
104 {0,2,0,2,7,10,7,10}
107 #define LEVEL_TAB_BITS 8
108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
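/* fill_caches() gathers everything the decoder needs from the neighbouring
 * macroblocks (intra 4x4 prediction modes, non-zero coefficient counts,
 * motion vectors, reference indices, mvd and direct flags) into small per-MB
 * cache arrays indexed through scan8[], resolving the frame/field neighbour
 * pairing when MBAFF is used. With for_deblock set, only the reduced subset
 * needed by the loop filter is filled. */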
110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
111 MpegEncContext * const s = &h->s;
112 const int mb_xy= h->mb_xy;
113 int topleft_xy, top_xy, topright_xy, left_xy[2];
114 int topleft_type, top_type, topright_type, left_type[2];
115 const uint8_t * left_block;
116 int topleft_partition= -1;
117 int i;
119 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
121 //FIXME deblocking could skip the intra and nnz parts.
122 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
123 return;
125 /* Wow, what a mess, why didn't they simplify the interlacing & intra
126 * stuff, I can't imagine that these complex rules are worth it. */
128 topleft_xy = top_xy - 1;
129 topright_xy= top_xy + 1;
130 left_xy[1] = left_xy[0] = mb_xy-1;
131 left_block = left_block_options[0];
132 if(FRAME_MBAFF){
133 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
134 const int top_pair_xy = pair_xy - s->mb_stride;
135 const int topleft_pair_xy = top_pair_xy - 1;
136 const int topright_pair_xy = top_pair_xy + 1;
137 const int topleft_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
138 const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
139 const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
140 const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
141 const int curr_mb_field_flag = IS_INTERLACED(mb_type);
142 const int bottom = (s->mb_y & 1);
143 tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
145 if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
146 top_xy -= s->mb_stride;
148 if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
149 topleft_xy -= s->mb_stride;
150 } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
151 topleft_xy += s->mb_stride;
152 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
153 topleft_partition = 0;
155 if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
156 topright_xy -= s->mb_stride;
158 if (left_mb_field_flag != curr_mb_field_flag) {
159 left_xy[1] = left_xy[0] = pair_xy - 1;
160 if (curr_mb_field_flag) {
161 left_xy[1] += s->mb_stride;
162 left_block = left_block_options[3];
163 } else {
164 left_block= left_block_options[2 - bottom];
169 h->top_mb_xy = top_xy;
170 h->left_mb_xy[0] = left_xy[0];
171 h->left_mb_xy[1] = left_xy[1];
172 if(for_deblock){
173 topleft_type = 0;
174 topright_type = 0;
175 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
176 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
177 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
179 if(MB_MBAFF && !IS_INTRA(mb_type)){
180 int list;
181 for(list=0; list<h->list_count; list++){
182 //These values were changed for ease of performing MC; we need to change them back
183 //FIXME maybe we can make MC and loop filter use the same values or prevent
184 //the MC code from changing ref_cache and rather use a temporary array.
185 if(USES_LIST(mb_type,list)){
186 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
187 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
188 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
189 ref += h->b8_stride;
190 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
191 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
195 }else{
196 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
197 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
198 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
199 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
200 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
202 if(IS_INTRA(mb_type)){
203 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
204 h->topleft_samples_available=
205 h->top_samples_available=
206 h->left_samples_available= 0xFFFF;
207 h->topright_samples_available= 0xEEEA;
209 if(!(top_type & type_mask)){
210 h->topleft_samples_available= 0xB3FF;
211 h->top_samples_available= 0x33FF;
212 h->topright_samples_available= 0x26EA;
214 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
215 if(IS_INTERLACED(mb_type)){
216 if(!(left_type[0] & type_mask)){
217 h->topleft_samples_available&= 0xDFFF;
218 h->left_samples_available&= 0x5FFF;
220 if(!(left_type[1] & type_mask)){
221 h->topleft_samples_available&= 0xFF5F;
222 h->left_samples_available&= 0xFF5F;
224 }else{
225 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
226 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
227 assert(left_xy[0] == left_xy[1]);
228 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
229 h->topleft_samples_available&= 0xDF5F;
230 h->left_samples_available&= 0x5F5F;
233 }else{
234 if(!(left_type[0] & type_mask)){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
240 if(!(topleft_type & type_mask))
241 h->topleft_samples_available&= 0x7FFF;
243 if(!(topright_type & type_mask))
244 h->topright_samples_available&= 0xFBFF;
246 if(IS_INTRA4x4(mb_type)){
247 if(IS_INTRA4x4(top_type)){
248 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
249 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
250 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
251 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
252 }else{
253 int pred;
254 if(!(top_type & type_mask))
255 pred= -1;
256 else{
257 pred= 2;
259 h->intra4x4_pred_mode_cache[4+8*0]=
260 h->intra4x4_pred_mode_cache[5+8*0]=
261 h->intra4x4_pred_mode_cache[6+8*0]=
262 h->intra4x4_pred_mode_cache[7+8*0]= pred;
264 for(i=0; i<2; i++){
265 if(IS_INTRA4x4(left_type[i])){
266 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
267 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
268 }else{
269 int pred;
270 if(!(left_type[i] & type_mask))
271 pred= -1;
272 else{
273 pred= 2;
275 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
276 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
/*
285 0 . T T. T T T T
286 1 L . .L . . . .
287 2 L . .L . . . .
288 3 . T TL . . . .
289 4 L . .L . . . .
290 5 L . .. . . . .
*/
292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
293 if(top_type){
294 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
295 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
296 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
297 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
299 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
300 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
302 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
303 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
305 }else{
306 h->non_zero_count_cache[4+8*0]=
307 h->non_zero_count_cache[5+8*0]=
308 h->non_zero_count_cache[6+8*0]=
309 h->non_zero_count_cache[7+8*0]=
311 h->non_zero_count_cache[1+8*0]=
312 h->non_zero_count_cache[2+8*0]=
314 h->non_zero_count_cache[1+8*3]=
315 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
319 for (i=0; i<2; i++) {
320 if(left_type[i]){
321 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
322 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
323 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
324 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
325 }else{
326 h->non_zero_count_cache[3+8*1 + 2*8*i]=
327 h->non_zero_count_cache[3+8*2 + 2*8*i]=
328 h->non_zero_count_cache[0+8*1 + 8*i]=
329 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
333 if( h->pps.cabac ) {
334 // top_cbp
335 if(top_type) {
336 h->top_cbp = h->cbp_table[top_xy];
337 } else if(IS_INTRA(mb_type)) {
338 h->top_cbp = 0x1C0;
339 } else {
340 h->top_cbp = 0;
342 // left_cbp
343 if (left_type[0]) {
344 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
345 } else if(IS_INTRA(mb_type)) {
346 h->left_cbp = 0x1C0;
347 } else {
348 h->left_cbp = 0;
350 if (left_type[0]) {
351 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
353 if (left_type[1]) {
354 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
358 #if 1
359 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
360 int list;
361 for(list=0; list<h->list_count; list++){
362 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
363 /*if(!h->mv_cache_clean[list]){
364 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
365 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
366 h->mv_cache_clean[list]= 1;
368 continue;
370 h->mv_cache_clean[list]= 0;
372 if(USES_LIST(top_type, list)){
373 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
374 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
375 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
376 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
377 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
378 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
379 h->ref_cache[list][scan8[0] + 0 - 1*8]=
380 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
381 h->ref_cache[list][scan8[0] + 2 - 1*8]=
382 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
383 }else{
384 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
385 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
386 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
387 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
388 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
391 for(i=0; i<2; i++){
392 int cache_idx = scan8[0] - 1 + i*2*8;
393 if(USES_LIST(left_type[i], list)){
394 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
395 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
396 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
397 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
398 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
399 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
400 }else{
401 *(uint32_t*)h->mv_cache [list][cache_idx ]=
402 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
403 h->ref_cache[list][cache_idx ]=
404 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
408 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
409 continue;
411 if(USES_LIST(topleft_type, list)){
412 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
413 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
414 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
415 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
416 }else{
417 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
418 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
421 if(USES_LIST(topright_type, list)){
422 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
423 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
424 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
425 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
426 }else{
427 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
428 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
431 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
432 continue;
434 h->ref_cache[list][scan8[5 ]+1] =
435 h->ref_cache[list][scan8[7 ]+1] =
436 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
437 h->ref_cache[list][scan8[4 ]] =
438 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
439 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
440 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
441 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
442 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
443 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
445 if( h->pps.cabac ) {
446 /* XXX yuck, load mvd */
447 if(USES_LIST(top_type, list)){
448 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
449 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
450 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
451 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
452 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
453 }else{
454 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
455 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
456 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
457 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
459 if(USES_LIST(left_type[0], list)){
460 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
461 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
462 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
463 }else{
464 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
465 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
467 if(USES_LIST(left_type[1], list)){
468 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
469 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
470 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
471 }else{
472 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
473 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
475 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
476 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
477 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
478 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
479 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
481 if(h->slice_type_nos == FF_B_TYPE){
482 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
484 if(IS_DIRECT(top_type)){
485 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
486 }else if(IS_8X8(top_type)){
487 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
488 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
489 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
490 }else{
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
494 if(IS_DIRECT(left_type[0]))
495 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
496 else if(IS_8X8(left_type[0]))
497 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
498 else
499 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
501 if(IS_DIRECT(left_type[1]))
502 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
503 else if(IS_8X8(left_type[1]))
504 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
505 else
506 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
510 if(FRAME_MBAFF){
511 #define MAP_MVS\
512 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
513 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
514 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
515 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
516 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
517 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
518 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
519 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
520 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
521 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
522 if(MB_FIELD){
523 #define MAP_F2F(idx, mb_type)\
524 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
525 h->ref_cache[list][idx] <<= 1;\
526 h->mv_cache[list][idx][1] /= 2;\
527 h->mvd_cache[list][idx][1] /= 2;\
529 MAP_MVS
530 #undef MAP_F2F
531 }else{
532 #define MAP_F2F(idx, mb_type)\
533 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
534 h->ref_cache[list][idx] >>= 1;\
535 h->mv_cache[list][idx][1] <<= 1;\
536 h->mvd_cache[list][idx][1] <<= 1;\
538 MAP_MVS
539 #undef MAP_F2F
544 #endif
546 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
549 static inline void write_back_intra_pred_mode(H264Context *h){
550 const int mb_xy= h->mb_xy;
552 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
553 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
554 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
555 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
556 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
557 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
558 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
562 * checks if the top & left blocks are available (if needed) and changes the dc mode so it only uses the available blocks.
564 static inline int check_intra4x4_pred_mode(H264Context *h){
565 MpegEncContext * const s = &h->s;
566 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
567 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
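/* top[]/left[] are indexed by the 4x4 intra prediction mode: a negative entry
 * means the mode cannot be used when that neighbour is missing (bitstream
 * error), 0 means the mode is usable as is, and a positive entry is the DC
 * variant to substitute (e.g. DC_PRED becomes LEFT_DC_PRED when the top row
 * is unavailable). */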
568 int i;
570 if(!(h->top_samples_available&0x8000)){
571 for(i=0; i<4; i++){
572 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
573 if(status<0){
574 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
575 return -1;
576 } else if(status){
577 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
582 if((h->left_samples_available&0x8888)!=0x8888){
583 static const int mask[4]={0x8000,0x2000,0x80,0x20};
584 for(i=0; i<4; i++){
585 if(!(h->left_samples_available&mask[i])){
586 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
587 if(status<0){
588 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
589 return -1;
590 } else if(status){
591 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
597 return 0;
598 } //FIXME cleanup like next
601 * checks if the top & left blocks are available (if needed) and changes the dc mode so it only uses the available blocks.
603 static inline int check_intra_pred_mode(H264Context *h, int mode){
604 MpegEncContext * const s = &h->s;
605 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
606 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
608 if(mode > 6U) {
609 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
610 return -1;
613 if(!(h->top_samples_available&0x8000)){
614 mode= top[ mode ];
615 if(mode<0){
616 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
617 return -1;
621 if((h->left_samples_available&0x8080) != 0x8080){
622 mode= left[ mode ];
623 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
624 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
626 if(mode<0){
627 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
628 return -1;
632 return mode;
636 * gets the predicted intra4x4 prediction mode.
638 static inline int pred_intra_mode(H264Context *h, int n){
639 const int index8= scan8[n];
640 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
641 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
642 const int min= FFMIN(left, top);
644 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
646 if(min<0) return DC_PRED;
647 else return min;
650 static inline void write_back_non_zero_count(H264Context *h){
651 const int mb_xy= h->mb_xy;
653 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
654 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
655 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
656 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
657 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
658 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
659 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
661 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
662 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
663 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
665 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
666 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
667 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
671 * gets the predicted number of non-zero coefficients.
672 * @param n block index
674 static inline int pred_non_zero_count(H264Context *h, int n){
675 const int index8= scan8[n];
676 const int left= h->non_zero_count_cache[index8 - 1];
677 const int top = h->non_zero_count_cache[index8 - 8];
678 int i= left + top;
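/* With CAVLC, neighbours that are unavailable are stored as 64 in the cache,
 * so left + top >= 64 skips the averaging below and the final &31 strips the
 * sentinel again: the prediction is (nA+nB+1)>>1 when both neighbours exist,
 * the single available count when only one does, and 0 when neither is
 * available. */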
680 if(i<64) i= (i+1)>>1;
682 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
684 return i&31;
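/* fetch_diagonal_mv() returns the reference index of the top-right neighbour
 * of block i and points *C at its motion vector, falling back to the top-left
 * neighbour when the top-right block is unavailable. The FRAME_MBAFF path
 * rescales the vertical component when the neighbour has the opposite
 * field/frame structure to the current MB. */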
687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
688 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
689 MpegEncContext *s = &h->s;
691 /* there is no consistent mapping of mvs to neighboring locations that will
692 * make mbaff happy, so we can't move all this logic to fill_caches */
693 if(FRAME_MBAFF){
694 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
695 const int16_t *mv;
696 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
697 *C = h->mv_cache[list][scan8[0]-2];
699 if(!MB_FIELD
700 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
701 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
702 if(IS_INTERLACED(mb_types[topright_xy])){
703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
704 const int x4 = X4, y4 = Y4;\
705 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
706 if(!USES_LIST(mb_type,list))\
707 return LIST_NOT_USED;\
708 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
709 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
710 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
711 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
713 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
716 if(topright_ref == PART_NOT_AVAILABLE
717 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
718 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
719 if(!MB_FIELD
720 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
721 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
723 if(MB_FIELD
724 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
725 && i >= scan8[0]+8){
726 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
727 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
730 #undef SET_DIAG_MV
733 if(topright_ref != PART_NOT_AVAILABLE){
734 *C= h->mv_cache[list][ i - 8 + part_width ];
735 return topright_ref;
736 }else{
737 tprintf(s->avctx, "topright MV not available\n");
739 *C= h->mv_cache[list][ i - 8 - 1 ];
740 return h->ref_cache[list][ i - 8 - 1 ];
745 * gets the predicted MV.
746 * @param n the block index
747 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
748 * @param mx the x component of the predicted motion vector
749 * @param my the y component of the predicted motion vector
751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
752 const int index8= scan8[n];
753 const int top_ref= h->ref_cache[list][ index8 - 8 ];
754 const int left_ref= h->ref_cache[list][ index8 - 1 ];
755 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
756 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
757 const int16_t * C;
758 int diagonal_ref, match_count;
760 assert(part_width==1 || part_width==2 || part_width==4);
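/* Standard H.264 median MV prediction: if exactly one neighbour uses the same
 * reference, its MV is taken directly; otherwise the component-wise median of
 * left (A), top (B) and top-right/top-left (C) is used, with a special case
 * when only the left neighbour is available. */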
762 /* mv_cache
763 B . . A T T T T
764 U . . L . . , .
765 U . . L . . . .
766 U . . L . . , .
767 . . . L . . . .
*/
770 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
771 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
772 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
773 if(match_count > 1){ //most common
774 *mx= mid_pred(A[0], B[0], C[0]);
775 *my= mid_pred(A[1], B[1], C[1]);
776 }else if(match_count==1){
777 if(left_ref==ref){
778 *mx= A[0];
779 *my= A[1];
780 }else if(top_ref==ref){
781 *mx= B[0];
782 *my= B[1];
783 }else{
784 *mx= C[0];
785 *my= C[1];
787 }else{
788 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
789 *mx= A[0];
790 *my= A[1];
791 }else{
792 *mx= mid_pred(A[0], B[0], C[0]);
793 *my= mid_pred(A[1], B[1], C[1]);
797 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
801 * gets the directionally predicted 16x8 MV.
802 * @param n the block index
803 * @param mx the x component of the predicted motion vector
804 * @param my the y component of the predicted motion vector
806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
807 if(n==0){
808 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
809 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
811 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
813 if(top_ref == ref){
814 *mx= B[0];
815 *my= B[1];
816 return;
818 }else{
819 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
820 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
822 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
824 if(left_ref == ref){
825 *mx= A[0];
826 *my= A[1];
827 return;
831 //RARE
832 pred_motion(h, n, 4, list, ref, mx, my);
836 * gets the directionally predicted 8x16 MV.
837 * @param n the block index
838 * @param mx the x component of the predicted motion vector
839 * @param my the y component of the predicted motion vector
841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
842 if(n==0){
843 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
844 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
846 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
848 if(left_ref == ref){
849 *mx= A[0];
850 *my= A[1];
851 return;
853 }else{
854 const int16_t * C;
855 int diagonal_ref;
857 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
859 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
861 if(diagonal_ref == ref){
862 *mx= C[0];
863 *my= C[1];
864 return;
868 //RARE
869 pred_motion(h, n, 2, list, ref, mx, my);
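/* P-skip MV prediction: the skip MV is forced to (0,0) if the top or left
 * neighbour is unavailable, or if either of them has reference 0 with a zero
 * motion vector; otherwise the normal 16x16 median prediction is used. */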
872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
873 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
874 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
876 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
878 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
879 || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
880 || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
882 *mx = *my = 0;
883 return;
886 pred_motion(h, 0, 4, 0, 0, mx, my);
888 return;
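/* Temporal direct-mode distance scaling: with td the POC distance between the
 * two reference pictures and tb the distance from the current picture to the
 * list-0 reference, the scale factor is
 *   clip(((16384 + |td|/2) / td * tb + 32) >> 6, -1024, 1023)
 * i.e. roughly 256 * tb / td; long-term references and td == 0 use the
 * neutral factor 256. */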
891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
892 int poc0 = h->ref_list[0][i].poc;
893 int td = av_clip(poc1 - poc0, -128, 127);
894 if(td == 0 || h->ref_list[0][i].long_ref){
895 return 256;
896 }else{
897 int tb = av_clip(poc - poc0, -128, 127);
898 int tx = (16384 + (FFABS(td) >> 1)) / td;
899 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
903 static inline void direct_dist_scale_factor(H264Context * const h){
904 MpegEncContext * const s = &h->s;
905 const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
906 const int poc1 = h->ref_list[1][0].poc;
907 int i, field;
908 for(field=0; field<2; field++){
909 const int poc = h->s.current_picture_ptr->field_poc[field];
910 const int poc1 = h->ref_list[1][0].field_poc[field];
911 for(i=0; i < 2*h->ref_count[0]; i++)
912 h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
915 for(i=0; i<h->ref_count[0]; i++){
916 h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
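/* fill_colmap() builds the table mapping a reference index of the co-located
 * (list 1) picture onto the current slice's list-0 indices, by matching the
 * key 4*frame_num + (reference&3); mbafi selects the extra per-field entries
 * used with MBAFF. Used by temporal direct prediction. */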
920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
921 MpegEncContext * const s = &h->s;
922 Picture * const ref1 = &h->ref_list[1][0];
923 int j, old_ref, rfield;
924 int start= mbafi ? 16 : 0;
925 int end = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
926 int interl= mbafi || s->picture_structure != PICT_FRAME;
928 /* bogus; fills in for missing frames */
929 memset(map[list], 0, sizeof(map[list]));
931 for(rfield=0; rfield<2; rfield++){
932 for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
933 int poc = ref1->ref_poc[colfield][list][old_ref];
935 if (!interl)
936 poc |= 3;
937 else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isn't needed
938 poc= (poc&~3) + rfield + 1;
940 for(j=start; j<end; j++){
941 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
942 int cur_ref= mbafi ? (j-16)^field : j;
943 map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
944 if(rfield == field)
945 map[list][old_ref] = cur_ref;
946 break;
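/* direct_ref_list_init() stores the current reference lists' counts and
 * (frame_num, parity) keys in the current picture and, for temporal direct
 * B slices, precomputes the co-located -> list0 mapping tables (frame and
 * field variants). */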
953 static inline void direct_ref_list_init(H264Context * const h){
954 MpegEncContext * const s = &h->s;
955 Picture * const ref1 = &h->ref_list[1][0];
956 Picture * const cur = s->current_picture_ptr;
957 int list, j, field;
958 int sidx= (s->picture_structure&1)^1;
959 int ref1sidx= (ref1->reference&1)^1;
961 for(list=0; list<2; list++){
962 cur->ref_count[sidx][list] = h->ref_count[list];
963 for(j=0; j<h->ref_count[list]; j++)
964 cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
967 if(s->picture_structure == PICT_FRAME){
968 memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
969 memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
972 cur->mbaff= FRAME_MBAFF;
974 if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
975 return;
977 for(list=0; list<2; list++){
978 fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
979 for(field=0; field<2; field++)
980 fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
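/* pred_direct_motion() derives the motion of B-direct (and B-skip)
 * macroblocks. The first block selects the correct co-located macroblock when
 * the current MB and the list-1 reference differ in frame/field coding.
 * Spatial mode then takes ref = min of the neighbouring references and the
 * median-predicted MV, zeroing partitions whose co-located block is (nearly)
 * static; temporal mode instead scales the co-located list-1 motion vectors
 * by dist_scale_factor. */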
984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
985 MpegEncContext * const s = &h->s;
986 int b8_stride = h->b8_stride;
987 int b4_stride = h->b_stride;
988 int mb_xy = h->mb_xy;
989 int mb_type_col[2];
990 const int16_t (*l1mv0)[2], (*l1mv1)[2];
991 const int8_t *l1ref0, *l1ref1;
992 const int is_b8x8 = IS_8X8(*mb_type);
993 unsigned int sub_mb_type;
994 int i8, i4;
996 assert(h->ref_list[1][0].reference&3);
998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1000 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
1002 int cur_poc = s->current_picture_ptr->poc;
1003 int *col_poc = h->ref_list[1]->field_poc;
1004 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006 b8_stride = 0;
1007 }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008 int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009 mb_xy += s->mb_stride*fieldoff;
1011 goto single_col;
1012 }else{ // AFL/AFR/FR/FL -> AFR/FR
1013 if(IS_INTERLACED(*mb_type)){ // AFL /FL -> AFR/FR
1014 mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017 b8_stride *= 3;
1018 b4_stride *= 6;
1019 //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022 && !is_b8x8){
1023 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024 *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025 }else{
1026 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1029 }else{ // AFR/FR -> AFR/FR
1030 single_col:
1031 mb_type_col[0] =
1032 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033 if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035 * so we know exactly what block size to use */
1036 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038 }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040 *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041 }else{
1042 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049 l1mv1 = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050 l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051 l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052 if(!b8_stride){
1053 if(s->mb_y&1){
1054 l1ref0 += h->b8_stride;
1055 l1ref1 += h->b8_stride;
1056 l1mv0 += 2*b4_stride;
1057 l1mv1 += 2*b4_stride;
1061 if(h->direct_spatial_mv_pred){
1062 int ref[2];
1063 int mv[2][2];
1064 int list;
1066 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1068 /* ref = min(neighbors) */
1069 for(list=0; list<2; list++){
1070 int refa = h->ref_cache[list][scan8[0] - 1];
1071 int refb = h->ref_cache[list][scan8[0] - 8];
1072 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073 if(refc == PART_NOT_AVAILABLE)
1074 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075 ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076 if(ref[list] < 0)
1077 ref[list] = -1;
1080 if(ref[0] < 0 && ref[1] < 0){
1081 ref[0] = ref[1] = 0;
1082 mv[0][0] = mv[0][1] =
1083 mv[1][0] = mv[1][1] = 0;
1084 }else{
1085 for(list=0; list<2; list++){
1086 if(ref[list] >= 0)
1087 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088 else
1089 mv[list][0] = mv[list][1] = 0;
1093 if(ref[1] < 0){
1094 if(!is_b8x8)
1095 *mb_type &= ~MB_TYPE_L1;
1096 sub_mb_type &= ~MB_TYPE_L1;
1097 }else if(ref[0] < 0){
1098 if(!is_b8x8)
1099 *mb_type &= ~MB_TYPE_L0;
1100 sub_mb_type &= ~MB_TYPE_L0;
1103 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104 for(i8=0; i8<4; i8++){
1105 int x8 = i8&1;
1106 int y8 = i8>>1;
1107 int xy8 = x8+y8*b8_stride;
1108 int xy4 = 3*x8+y8*b4_stride;
1109 int a=0, b=0;
1111 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112 continue;
1113 h->sub_mb_type[i8] = sub_mb_type;
1115 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117 if(!IS_INTRA(mb_type_col[y8])
1118 && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119 || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120 if(ref[0] > 0)
1121 a= pack16to32(mv[0][0],mv[0][1]);
1122 if(ref[1] > 0)
1123 b= pack16to32(mv[1][0],mv[1][1]);
1124 }else{
1125 a= pack16to32(mv[0][0],mv[0][1]);
1126 b= pack16to32(mv[1][0],mv[1][1]);
1128 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1131 }else if(IS_16X16(*mb_type)){
1132 int a=0, b=0;
1134 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136 if(!IS_INTRA(mb_type_col[0])
1137 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139 && (h->x264_build>33 || !h->x264_build)))){
1140 if(ref[0] > 0)
1141 a= pack16to32(mv[0][0],mv[0][1]);
1142 if(ref[1] > 0)
1143 b= pack16to32(mv[1][0],mv[1][1]);
1144 }else{
1145 a= pack16to32(mv[0][0],mv[0][1]);
1146 b= pack16to32(mv[1][0],mv[1][1]);
1148 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150 }else{
1151 for(i8=0; i8<4; i8++){
1152 const int x8 = i8&1;
1153 const int y8 = i8>>1;
1155 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156 continue;
1157 h->sub_mb_type[i8] = sub_mb_type;
1159 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1164 /* col_zero_flag */
1165 if(!IS_INTRA(mb_type_col[0]) && ( l1ref0[x8 + y8*b8_stride] == 0
1166 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167 && (h->x264_build>33 || !h->x264_build)))){
1168 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169 if(IS_SUB_8X8(sub_mb_type)){
1170 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172 if(ref[0] == 0)
1173 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174 if(ref[1] == 0)
1175 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1177 }else
1178 for(i4=0; i4<4; i4++){
1179 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181 if(ref[0] == 0)
1182 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183 if(ref[1] == 0)
1184 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1190 }else{ /* direct temporal mv pred */
1191 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192 const int *dist_scale_factor = h->dist_scale_factor;
1193 int ref_offset= 0;
1195 if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196 map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197 map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198 dist_scale_factor =h->dist_scale_factor_field[s->mb_y&1];
1200 if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201 ref_offset += 16;
1203 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204 /* FIXME assumes direct_8x8_inference == 1 */
1205 int y_shift = 2*!IS_INTERLACED(*mb_type);
1207 for(i8=0; i8<4; i8++){
1208 const int x8 = i8&1;
1209 const int y8 = i8>>1;
1210 int ref0, scale;
1211 const int16_t (*l1mv)[2]= l1mv0;
1213 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214 continue;
1215 h->sub_mb_type[i8] = sub_mb_type;
1217 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218 if(IS_INTRA(mb_type_col[y8])){
1219 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222 continue;
1225 ref0 = l1ref0[x8 + y8*b8_stride];
1226 if(ref0 >= 0)
1227 ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228 else{
1229 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230 l1mv= l1mv1;
1232 scale = dist_scale_factor[ref0];
1233 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1236 const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237 int my_col = (mv_col[1]<<y_shift)/2;
1238 int mx = (scale * mv_col[0] + 128) >> 8;
1239 int my = (scale * my_col + 128) >> 8;
1240 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1244 return;
1247 /* one-to-one mv scaling */
1249 if(IS_16X16(*mb_type)){
1250 int ref, mv0, mv1;
1252 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253 if(IS_INTRA(mb_type_col[0])){
1254 ref=mv0=mv1=0;
1255 }else{
1256 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258 const int scale = dist_scale_factor[ref0];
1259 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260 int mv_l0[2];
1261 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263 ref= ref0;
1264 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1267 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270 }else{
1271 for(i8=0; i8<4; i8++){
1272 const int x8 = i8&1;
1273 const int y8 = i8>>1;
1274 int ref0, scale;
1275 const int16_t (*l1mv)[2]= l1mv0;
1277 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278 continue;
1279 h->sub_mb_type[i8] = sub_mb_type;
1280 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281 if(IS_INTRA(mb_type_col[0])){
1282 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285 continue;
1288 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289 if(ref0 >= 0)
1290 ref0 = map_col_to_list0[0][ref0];
1291 else{
1292 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293 l1mv= l1mv1;
1295 scale = dist_scale_factor[ref0];
1297 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298 if(IS_SUB_8X8(sub_mb_type)){
1299 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300 int mx = (scale * mv_col[0] + 128) >> 8;
1301 int my = (scale * mv_col[1] + 128) >> 8;
1302 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304 }else
1305 for(i4=0; i4<4; i4++){
1306 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
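/* write_back_motion() copies the per-MB mv/ref caches (and, with CABAC, the
 * mvd and direct flags) back into the frame-wide arrays of current_picture,
 * one 64-bit store per pair of 4x4 blocks. */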
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319 MpegEncContext * const s = &h->s;
1320 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322 int list;
1324 if(!USES_LIST(mb_type, 0))
1325 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1327 for(list=0; list<h->list_count; list++){
1328 int y;
1329 if(!USES_LIST(mb_type, list))
1330 continue;
1332 for(y=0; y<4; y++){
1333 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1336 if( h->pps.cabac ) {
1337 if(IS_SKIP(mb_type))
1338 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339 else
1340 for(y=0; y<4; y++){
1341 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1355 if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356 if(IS_8X8(mb_type)){
1357 uint8_t *direct_table = &h->direct_table[b8_xy];
1358 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
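/* ff_h264_decode_nal() parses nal_ref_idc / nal_unit_type from the first byte
 * and undoes the emulation-prevention escaping of the payload: every
 * 00 00 03 sequence in the raw byte stream becomes 00 00 in the RBSP, while
 * 00 00 0x with x < 3 marks the next start code and thus the end of this NAL
 * unit. Returns the unescaped payload and sets *consumed to the number of
 * input bytes used. */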
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1366 int i, si, di;
1367 uint8_t *dst;
1368 int bufidx;
1370 // src[0]&0x80; //forbidden bit
1371 h->nal_ref_idc= src[0]>>5;
1372 h->nal_unit_type= src[0]&0x1F;
1374 src++; length--;
1375 #if 0
1376 for(i=0; i<length; i++)
1377 printf("%2X ", src[i]);
1378 #endif
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1382 # define RS 7
1383 for(i=0; i+1<length; i+=9){
1384 if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1385 # else
1386 # define RS 3
1387 for(i=0; i+1<length; i+=5){
1388 if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1389 # endif
1390 continue;
1391 if(i>0 && !src[i]) i--;
1392 while(src[i]) i++;
1393 #else
1394 # define RS 0
1395 for(i=0; i+1<length; i+=2){
1396 if(src[i]) continue;
1397 if(i>0 && src[i-1]==0) i--;
1398 #endif
1399 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1400 if(src[i+2]!=3){
1401 /* startcode, so we must be past the end */
1402 length=i;
1404 break;
1406 i-= RS;
1409 if(i>=length-1){ //no escaped 0
1410 *dst_length= length;
1411 *consumed= length+1; //+1 for the header
1412 return src;
1415 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416 av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417 dst= h->rbsp_buffer[bufidx];
1419 if (dst == NULL){
1420 return NULL;
1423 //printf("decoding esc\n");
1424 memcpy(dst, src, i);
1425 si=di=i;
1426 while(si+2<length){
1427 //remove escapes (very rare 1:2^22)
1428 if(src[si+2]>3){
1429 dst[di++]= src[si++];
1430 dst[di++]= src[si++];
1431 }else if(src[si]==0 && src[si+1]==0){
1432 if(src[si+2]==3){ //escape
1433 dst[di++]= 0;
1434 dst[di++]= 0;
1435 si+=3;
1436 continue;
1437 }else //next start code
1438 goto nsc;
1441 dst[di++]= src[si++];
1443 while(si<length)
1444 dst[di++]= src[si++];
1445 nsc:
1447 memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1449 *dst_length= di;
1450 *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1452 return dst;
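/* ff_h264_decode_rbsp_trailing() returns the number of trailing bits in the
 * last RBSP byte (the rbsp_stop_one_bit plus the zero padding below it), or
 * 0 if no stop bit is present. */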
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456 int v= *src;
1457 int r;
1459 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1461 for(r=1; r<9; r++){
1462 if(v&1) return r;
1463 v>>=1;
1465 return 0;
1469 * IDCT transforms the 16 dc values and dequantizes them.
1470 * @param qp quantization parameter
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474 int i;
1475 int temp[16]; //FIXME check if this is a good idea
1476 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1477 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
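/* The 16 luma DC coefficients sit at the DC position of each 4x4 block inside
 * the shared coefficient buffer (stride 16), hence the x_offset/y_offset
 * tables; a separable 4x4 Hadamard transform is applied and each result is
 * scaled by qmul with rounding (>> 8). */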
1479 //memset(block, 64, 2*256);
1480 //return;
1481 for(i=0; i<4; i++){
1482 const int offset= y_offset[i];
1483 const int z0= block[offset+stride*0] + block[offset+stride*4];
1484 const int z1= block[offset+stride*0] - block[offset+stride*4];
1485 const int z2= block[offset+stride*1] - block[offset+stride*5];
1486 const int z3= block[offset+stride*1] + block[offset+stride*5];
1488 temp[4*i+0]= z0+z3;
1489 temp[4*i+1]= z1+z2;
1490 temp[4*i+2]= z1-z2;
1491 temp[4*i+3]= z0-z3;
1494 for(i=0; i<4; i++){
1495 const int offset= x_offset[i];
1496 const int z0= temp[4*0+i] + temp[4*2+i];
1497 const int z1= temp[4*0+i] - temp[4*2+i];
1498 const int z2= temp[4*1+i] - temp[4*3+i];
1499 const int z3= temp[4*1+i] + temp[4*3+i];
1501 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1508 #if 0
1510 * DCT transforms the 16 dc values.
1511 * @param qp quantization parameter ??? FIXME
1513 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1514 // const int qmul= dequant_coeff[qp][0];
1515 int i;
1516 int temp[16]; //FIXME check if this is a good idea
1517 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1518 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1520 for(i=0; i<4; i++){
1521 const int offset= y_offset[i];
1522 const int z0= block[offset+stride*0] + block[offset+stride*4];
1523 const int z1= block[offset+stride*0] - block[offset+stride*4];
1524 const int z2= block[offset+stride*1] - block[offset+stride*5];
1525 const int z3= block[offset+stride*1] + block[offset+stride*5];
1527 temp[4*i+0]= z0+z3;
1528 temp[4*i+1]= z1+z2;
1529 temp[4*i+2]= z1-z2;
1530 temp[4*i+3]= z0-z3;
1533 for(i=0; i<4; i++){
1534 const int offset= x_offset[i];
1535 const int z0= temp[4*0+i] + temp[4*2+i];
1536 const int z1= temp[4*0+i] - temp[4*2+i];
1537 const int z2= temp[4*1+i] - temp[4*3+i];
1538 const int z3= temp[4*1+i] + temp[4*3+i];
1540 block[stride*0 +offset]= (z0 + z3)>>1;
1541 block[stride*2 +offset]= (z1 + z2)>>1;
1542 block[stride*8 +offset]= (z1 - z2)>>1;
1543 block[stride*10+offset]= (z0 - z3)>>1;
1546 #endif
1548 #undef xStride
1549 #undef stride
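/* 2x2 Hadamard transform and dequantisation of the four chroma DC
 * coefficients, which are stored 32 (stride) and 16 (xStride) apart in the
 * shared block buffer. */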
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552 const int stride= 16*2;
1553 const int xStride= 16;
1554 int a,b,c,d,e;
1556 a= block[stride*0 + xStride*0];
1557 b= block[stride*0 + xStride*1];
1558 c= block[stride*1 + xStride*0];
1559 d= block[stride*1 + xStride*1];
1561 e= a-b;
1562 a= a+b;
1563 b= c-d;
1564 c= c+d;
1566 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1572 #if 0
1573 static void chroma_dc_dct_c(DCTELEM *block){
1574 const int stride= 16*2;
1575 const int xStride= 16;
1576 int a,b,c,d,e;
1578 a= block[stride*0 + xStride*0];
1579 b= block[stride*0 + xStride*1];
1580 c= block[stride*1 + xStride*0];
1581 d= block[stride*1 + xStride*1];
1583 e= a-b;
1584 a= a+b;
1585 b= c-d;
1586 c= c+d;
1588 block[stride*0 + xStride*0]= (a+c);
1589 block[stride*0 + xStride*1]= (e+b);
1590 block[stride*1 + xStride*0]= (a-c);
1591 block[stride*1 + xStride*1]= (e-b);
1593 #endif
1595 /**
1596 * gets the chroma qp.
1597 */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599 return h->pps.chroma_qp_table[t][qscale];
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604 int src_x_offset, int src_y_offset,
1605 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606 MpegEncContext * const s = &h->s;
1607 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609 const int luma_xy= (mx&3) + ((my&3)<<2);
1610 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611 uint8_t * src_cb, * src_cr;
1612 int extra_width= h->emu_edge_width;
1613 int extra_height= h->emu_edge_height;
1614 int emu=0;
1615 const int full_mx= mx>>2;
1616 const int full_my= my>>2;
1617 const int pic_width = 16*s->mb_width;
1618 const int pic_height = 16*s->mb_height >> MB_FIELD;
1620 if(mx&7) extra_width -= 3;
1621 if(my&7) extra_height -= 3;
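/* If the motion vector has a fractional part, the 6-tap luma interpolation
 * filter reads a few extra source pixels on each side, so the region where
 * ff_emulated_edge_mc() can be skipped shrinks accordingly. */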
1623 if( full_mx < 0-extra_width
1624 || full_my < 0-extra_height
1625 || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626 || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1629 emu=1;
1632 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1633 if(!square){
1634 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1637 if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1639 if(MB_FIELD){
1640 // chroma offset when predicting from a field of opposite parity
1641 my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1644 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1647 if(emu){
1648 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649 src_cb= s->edge_emu_buffer;
1651 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1653 if(emu){
1654 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655 src_cr= s->edge_emu_buffer;
1657 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662 int x_offset, int y_offset,
1663 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665 int list0, int list1){
1666 MpegEncContext * const s = &h->s;
1667 qpel_mc_func *qpix_op= qpix_put;
1668 h264_chroma_mc_func chroma_op= chroma_put;
1670 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1671 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1672 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1673 x_offset += 8*s->mb_x;
1674 y_offset += 8*(s->mb_y >> MB_FIELD);
1676 if(list0){
1677 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680 qpix_op, chroma_op);
1682 qpix_op= qpix_avg;
1683 chroma_op= chroma_avg;
1686 if(list1){
1687 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690 qpix_op, chroma_op);
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696 int x_offset, int y_offset,
1697 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700 int list0, int list1){
1701 MpegEncContext * const s = &h->s;
1703 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1704 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1705 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1706 x_offset += 8*s->mb_x;
1707 y_offset += 8*(s->mb_y >> MB_FIELD);
1709 if(list0 && list1){
1710 /* don't optimize for luma-only case, since B-frames usually
1711 * use implicit weights => chroma too. */
1712 uint8_t *tmp_cb = s->obmc_scratchpad;
1713 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715 int refn0 = h->ref_cache[0][ scan8[n] ];
1716 int refn1 = h->ref_cache[1][ scan8[n] ];
1718 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719 dest_y, dest_cb, dest_cr,
1720 x_offset, y_offset, qpix_put, chroma_put);
1721 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722 tmp_y, tmp_cb, tmp_cr,
1723 x_offset, y_offset, qpix_put, chroma_put);
1725 if(h->use_weight == 2){
1726 int weight0 = h->implicit_weight[refn0][refn1];
1727 int weight1 = 64 - weight0;
1728 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1729 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731 }else{
1732 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1742 }else{
1743 int list = list1 ? 1 : 0;
1744 int refn = h->ref_cache[list][ scan8[n] ];
1745 Picture *ref= &h->ref_list[list][refn];
1746 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748 qpix_put, chroma_put);
1750 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752 if(h->use_weight_chroma){
1753 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763 int x_offset, int y_offset,
1764 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767 int list0, int list1){
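/* Explicit weighted prediction (use_weight==1) always takes the weighted
 * path. Implicit weighting (use_weight==2) only needs it for bi-predicted
 * partitions whose implicit weight differs from the 32/32 default; equal
 * weights reduce to a plain average, which the standard path already does. */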
1768 if((h->use_weight==2 && list0 && list1
1769 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770 || h->use_weight==1)
1771 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772 x_offset, y_offset, qpix_put, chroma_put,
1773 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774 else
1775 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1779 static inline void prefetch_motion(H264Context *h, int list){
1780 /* fetch pixels for estimated mv 4 macroblocks ahead
1781 * optimized for 64byte cache lines */
1782 MpegEncContext * const s = &h->s;
1783 const int refn = h->ref_cache[list][scan8[0]];
1784 if(refn >= 0){
1785 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787 uint8_t **src= h->ref_list[list][refn].data;
1788 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799 MpegEncContext * const s = &h->s;
1800 const int mb_xy= h->mb_xy;
1801 const int mb_type= s->current_picture.mb_type[mb_xy];
1803 assert(IS_INTER(mb_type));
1805 prefetch_motion(h, 0);
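/* Dispatch on the macroblock partitioning: one 16x16 call, two 16x8 or 8x16
 * calls, or four 8x8 sub-partitions which may be split further into 8x4,
 * 4x8 or 4x4 blocks, each motion compensated with its own list0/list1
 * direction flags. */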
1807 if(IS_16X16(mb_type)){
1808 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810 &weight_op[0], &weight_avg[0],
1811 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812 }else if(IS_16X8(mb_type)){
1813 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815 &weight_op[1], &weight_avg[1],
1816 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819 &weight_op[1], &weight_avg[1],
1820 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821 }else if(IS_8X16(mb_type)){
1822 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824 &weight_op[2], &weight_avg[2],
1825 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828 &weight_op[2], &weight_avg[2],
1829 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830 }else{
1831 int i;
1833 assert(IS_8X8(mb_type));
1835 for(i=0; i<4; i++){
1836 const int sub_mb_type= h->sub_mb_type[i];
1837 const int n= 4*i;
1838 int x_offset= (i&1)<<2;
1839 int y_offset= (i&2)<<1;
1841 if(IS_SUB_8X8(sub_mb_type)){
1842 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844 &weight_op[3], &weight_avg[3],
1845 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846 }else if(IS_SUB_8X4(sub_mb_type)){
1847 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849 &weight_op[4], &weight_avg[4],
1850 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853 &weight_op[4], &weight_avg[4],
1854 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855 }else if(IS_SUB_4X8(sub_mb_type)){
1856 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858 &weight_op[5], &weight_avg[5],
1859 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862 &weight_op[5], &weight_avg[5],
1863 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864 }else{
1865 int j;
1866 assert(IS_SUB_4X4(sub_mb_type));
1867 for(j=0; j<4; j++){
1868 int sub_x_offset= x_offset + 2*(j&1);
1869 int sub_y_offset= y_offset + (j&2);
1870 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872 &weight_op[6], &weight_avg[6],
1873 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1879 prefetch_motion(h, 1);
1882 static av_cold void init_cavlc_level_tab(void){
1883 int suffix_length, mask;
1884 unsigned int i;
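/* Build a lookup table for CAVLC level decoding: for each suffix_length and
 * each LEVEL_TAB_BITS-bit window of the bitstream, [0] holds the decoded
 * signed level (or prefix+100 as an escape marker when the full code does
 * not fit into the window) and [1] holds the number of bits consumed. */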
1886 for(suffix_length=0; suffix_length<7; suffix_length++){
1887 for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888 int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889 int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1891 mask= -(level_code&1);
1892 level_code= (((2+level_code)>>1) ^ mask) - mask;
1893 if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894 cavlc_level_tab[suffix_length][i][0]= level_code;
1895 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896 }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899 }else{
1900 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1907 static av_cold void decode_init_vlc(void){
1908 static int done = 0;
1910 if (!done) {
1911 int i;
1912 int offset;
1913 done = 1;
1915 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918 &chroma_dc_coeff_token_len [0], 1, 1,
1919 &chroma_dc_coeff_token_bits[0], 1, 1,
1920 INIT_VLC_USE_NEW_STATIC);
1922 offset = 0;
1923 for(i=0; i<4; i++){
1924 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927 &coeff_token_len [i][0], 1, 1,
1928 &coeff_token_bits[i][0], 1, 1,
1929 INIT_VLC_USE_NEW_STATIC);
1930 offset += coeff_token_vlc_tables_size[i];
1932 /*
1933 * This is a one time safety check to make sure that
1934 * the packed static coeff_token_vlc table sizes
1935 * were initialized correctly.
1936 */
1937 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1939 for(i=0; i<3; i++){
1940 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942 init_vlc(&chroma_dc_total_zeros_vlc[i],
1943 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944 &chroma_dc_total_zeros_len [i][0], 1, 1,
1945 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946 INIT_VLC_USE_NEW_STATIC);
1948 for(i=0; i<15; i++){
1949 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951 init_vlc(&total_zeros_vlc[i],
1952 TOTAL_ZEROS_VLC_BITS, 16,
1953 &total_zeros_len [i][0], 1, 1,
1954 &total_zeros_bits[i][0], 1, 1,
1955 INIT_VLC_USE_NEW_STATIC);
1958 for(i=0; i<6; i++){
1959 run_vlc[i].table = run_vlc_tables[i];
1960 run_vlc[i].table_allocated = run_vlc_tables_size;
1961 init_vlc(&run_vlc[i],
1962 RUN_VLC_BITS, 7,
1963 &run_len [i][0], 1, 1,
1964 &run_bits[i][0], 1, 1,
1965 INIT_VLC_USE_NEW_STATIC);
1967 run7_vlc.table = run7_vlc_table,
1968 run7_vlc.table_allocated = run7_vlc_table_size;
1969 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970 &run_len [6][0], 1, 1,
1971 &run_bits[6][0], 1, 1,
1972 INIT_VLC_USE_NEW_STATIC);
1974 init_cavlc_level_tab();
1978 static void free_tables(H264Context *h){
1979 int i;
1980 H264Context *hx;
1981 av_freep(&h->intra4x4_pred_mode);
1982 av_freep(&h->chroma_pred_mode_table);
1983 av_freep(&h->cbp_table);
1984 av_freep(&h->mvd_table[0]);
1985 av_freep(&h->mvd_table[1]);
1986 av_freep(&h->direct_table);
1987 av_freep(&h->non_zero_count);
1988 av_freep(&h->slice_table_base);
1989 h->slice_table= NULL;
1991 av_freep(&h->mb2b_xy);
1992 av_freep(&h->mb2b8_xy);
1994 for(i = 0; i < MAX_THREADS; i++) {
1995 hx = h->thread_context[i];
1996 if(!hx) continue;
1997 av_freep(&hx->top_borders[1]);
1998 av_freep(&hx->top_borders[0]);
1999 av_freep(&hx->s.obmc_scratchpad);
2000 av_freep(&hx->rbsp_buffer[1]);
2001 av_freep(&hx->rbsp_buffer[0]);
2002 hx->rbsp_buffer_size[0] = 0;
2003 hx->rbsp_buffer_size[1] = 0;
2004 if (i) av_freep(&h->thread_context[i]);
2008 static void init_dequant8_coeff_table(H264Context *h){
2009 int i,q,x;
2010 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2011 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2012 h->dequant8_coeff[1] = h->dequant8_buffer[1];
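/* Build the 8x8 dequant tables for the intra (i==0) and inter (i==1)
 * scaling lists; if both PPS lists are identical the second table simply
 * aliases the first. Coefficients are stored transposed when the active
 * IDCT implementation expects transposed input. */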
2014 for(i=0; i<2; i++ ){
2015 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2016 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2017 break;
2020 for(q=0; q<52; q++){
2021 int shift = div6[q];
2022 int idx = rem6[q];
2023 for(x=0; x<64; x++)
2024 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2025 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2026 h->pps.scaling_matrix8[i][x]) << shift;
2031 static void init_dequant4_coeff_table(H264Context *h){
2032 int i,j,q,x;
2033 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2034 for(i=0; i<6; i++ ){
2035 h->dequant4_coeff[i] = h->dequant4_buffer[i];
2036 for(j=0; j<i; j++){
2037 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2038 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2039 break;
2042 if(j<i)
2043 continue;
2045 for(q=0; q<52; q++){
2046 int shift = div6[q] + 2;
2047 int idx = rem6[q];
2048 for(x=0; x<16; x++)
2049 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2050 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2051 h->pps.scaling_matrix4[i][x]) << shift;
2056 static void init_dequant_tables(H264Context *h){
2057 int i,x;
2058 init_dequant4_coeff_table(h);
2059 if(h->pps.transform_8x8_mode)
2060 init_dequant8_coeff_table(h);
2061 if(h->sps.transform_bypass){
2062 for(i=0; i<6; i++)
2063 for(x=0; x<16; x++)
2064 h->dequant4_coeff[i][0][x] = 1<<6;
2065 if(h->pps.transform_8x8_mode)
2066 for(i=0; i<2; i++)
2067 for(x=0; x<64; x++)
2068 h->dequant8_coeff[i][0][x] = 1<<6;
2073 /**
2074 * allocates tables.
2075 * needs width/height
2076 */
2077 static int alloc_tables(H264Context *h){
2078 MpegEncContext * const s = &h->s;
2079 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2080 int x,y;
2082 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail)
2084 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t), fail)
2085 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
2086 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
2088 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
2089 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
2090 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
2091 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
2093 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
2094 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
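/* slice_table gets a guard band of -1 ("no slice") entries above and to the
 * left of the first macroblock row, so neighbour lookups at picture borders
 * need no explicit bounds checks. */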
2096 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy , big_mb_num * sizeof(uint32_t), fail);
2097 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
2098 for(y=0; y<s->mb_height; y++){
2099 for(x=0; x<s->mb_width; x++){
2100 const int mb_xy= x + y*s->mb_stride;
2101 const int b_xy = 4*x + 4*y*h->b_stride;
2102 const int b8_xy= 2*x + 2*y*h->b8_stride;
2104 h->mb2b_xy [mb_xy]= b_xy;
2105 h->mb2b8_xy[mb_xy]= b8_xy;
2109 s->obmc_scratchpad = NULL;
2111 if(!h->dequant4_coeff[0])
2112 init_dequant_tables(h);
2114 return 0;
2115 fail:
2116 free_tables(h);
2117 return -1;
2120 /**
2121 * Mimic alloc_tables(), but for every context thread.
2122 */
2123 static void clone_tables(H264Context *dst, H264Context *src){
2124 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2125 dst->non_zero_count = src->non_zero_count;
2126 dst->slice_table = src->slice_table;
2127 dst->cbp_table = src->cbp_table;
2128 dst->mb2b_xy = src->mb2b_xy;
2129 dst->mb2b8_xy = src->mb2b8_xy;
2130 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2131 dst->mvd_table[0] = src->mvd_table[0];
2132 dst->mvd_table[1] = src->mvd_table[1];
2133 dst->direct_table = src->direct_table;
2135 dst->s.obmc_scratchpad = NULL;
2136 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2139 /**
2140 * Init context
2141 * Allocate buffers which are not shared amongst multiple threads.
2142 */
2143 static int context_init(H264Context *h){
2144 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2145 FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2147 return 0;
2148 fail:
2149 return -1; // free_tables will clean up for us
2152 static av_cold void common_init(H264Context *h){
2153 MpegEncContext * const s = &h->s;
2155 s->width = s->avctx->width;
2156 s->height = s->avctx->height;
2157 s->codec_id= s->avctx->codec->id;
2159 ff_h264_pred_init(&h->hpc, s->codec_id);
2161 h->dequant_coeff_pps= -1;
2162 s->unrestricted_mv=1;
2163 s->decode=1; //FIXME
2165 dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2167 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2168 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2171 /**
2172 * Reset SEI values at the beginning of the frame.
2173 *
2174 * @param h H.264 context.
2175 */
2176 static void reset_sei(H264Context *h) {
2177 h->sei_recovery_frame_cnt = -1;
2178 h->sei_dpb_output_delay = 0;
2179 h->sei_cpb_removal_delay = -1;
2180 h->sei_buffering_period_present = 0;
2183 static av_cold int decode_init(AVCodecContext *avctx){
2184 H264Context *h= avctx->priv_data;
2185 MpegEncContext * const s = &h->s;
2187 MPV_decode_defaults(s);
2189 s->avctx = avctx;
2190 common_init(h);
2192 s->out_format = FMT_H264;
2193 s->workaround_bugs= avctx->workaround_bugs;
2195 // set defaults
2196 // s->decode_mb= ff_h263_decode_mb;
2197 s->quarter_sample = 1;
2198 if(!avctx->has_b_frames)
2199 s->low_delay= 1;
2201 avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2202 avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2203 avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
2205 decode_init_vlc();
2207 if(avctx->extradata_size > 0 && avctx->extradata &&
2208 *(char *)avctx->extradata == 1){
2209 h->is_avc = 1;
2210 h->got_avcC = 0;
2211 } else {
2212 h->is_avc = 0;
2215 h->thread_context[0] = h;
2216 h->outputed_poc = INT_MIN;
2217 h->prev_poc_msb= 1<<16;
2218 reset_sei(h);
2219 if(avctx->codec_id == CODEC_ID_H264){
2220 if(avctx->ticks_per_frame == 1){
2221 s->avctx->time_base.den *=2;
2222 }
2223 avctx->ticks_per_frame = 2;
2224 }
2225 return 0;
2228 static int frame_start(H264Context *h){
2229 MpegEncContext * const s = &h->s;
2230 int i;
2232 if(MPV_frame_start(s, s->avctx) < 0)
2233 return -1;
2234 ff_er_frame_start(s);
2235 /*
2236 * MPV_frame_start uses pict_type to derive key_frame.
2237 * This is incorrect for H.264; IDR markings must be used.
2238 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2239 * See decode_nal_units().
2240 */
2241 s->current_picture_ptr->key_frame= 0;
2242 s->current_picture_ptr->mmco_reset= 0;
2244 assert(s->linesize && s->uvlinesize);
2246 for(i=0; i<16; i++){
2247 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2248 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2250 for(i=0; i<4; i++){
2251 h->block_offset[16+i]=
2252 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2253 h->block_offset[24+16+i]=
2254 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2257 /* can't be in alloc_tables because linesize isn't known there.
2258 * FIXME: redo bipred weight to not require extra buffer? */
2259 for(i = 0; i < s->avctx->thread_count; i++)
2260 if(!h->thread_context[i]->s.obmc_scratchpad)
2261 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2263 /* some macroblocks will be accessed before they're available */
2264 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2265 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2267 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2269 // We mark the current picture as non-reference after allocating it, so
2270 // that if we break out due to an error it can be released automatically
2271 // in the next MPV_frame_start().
2272 // SVQ3, like most other codecs, only keeps last/next/current pictures, so they
2273 // get released even with the reference flag set; besides, SVQ3 and the others
2274 // do not mark frames as reference later "naturally".
2275 if(s->codec_id != CODEC_ID_SVQ3)
2276 s->current_picture_ptr->reference= 0;
2278 s->current_picture_ptr->field_poc[0]=
2279 s->current_picture_ptr->field_poc[1]= INT_MAX;
2280 assert(s->current_picture_ptr->long_ref==0);
2282 return 0;
2285 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2286 MpegEncContext * const s = &h->s;
2287 int i;
2288 int step = 1;
2289 int offset = 1;
2290 int uvoffset= 1;
2291 int top_idx = 1;
2292 int skiplast= 0;
2294 src_y -= linesize;
2295 src_cb -= uvlinesize;
2296 src_cr -= uvlinesize;
2298 if(!simple && FRAME_MBAFF){
2299 if(s->mb_y&1){
2300 offset = MB_MBAFF ? 1 : 17;
2301 uvoffset= MB_MBAFF ? 1 : 9;
2302 if(!MB_MBAFF){
2303 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2304 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2305 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2306 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2307 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2310 }else{
2311 if(!MB_MBAFF){
2312 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2313 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2314 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2315 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2317 skiplast= 1;
2319 offset =
2320 uvoffset=
2321 top_idx = MB_MBAFF ? 0 : 1;
2323 step= MB_MBAFF ? 2 : 1;
2326 // There are two lines saved, the line above the top macroblock of a pair,
2327 // and the line above the bottom macroblock
2328 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2329 for(i=1; i<17 - skiplast; i++){
2330 h->left_border[offset+i*step]= src_y[15+i* linesize];
2333 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2334 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2336 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2337 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2338 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2339 for(i=1; i<9 - skiplast; i++){
2340 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2341 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2343 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2344 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2348 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2349 MpegEncContext * const s = &h->s;
2350 int temp8, i;
2351 uint64_t temp64;
2352 int deblock_left;
2353 int deblock_top;
2354 int mb_xy;
2355 int step = 1;
2356 int offset = 1;
2357 int uvoffset= 1;
2358 int top_idx = 1;
2360 if(!simple && FRAME_MBAFF){
2361 if(s->mb_y&1){
2362 offset = MB_MBAFF ? 1 : 17;
2363 uvoffset= MB_MBAFF ? 1 : 9;
2364 }else{
2365 offset =
2366 uvoffset=
2367 top_idx = MB_MBAFF ? 0 : 1;
2369 step= MB_MBAFF ? 2 : 1;
2372 if(h->deblocking_filter == 2) {
2373 mb_xy = h->mb_xy;
2374 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2375 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2376 } else {
2377 deblock_left = (s->mb_x > 0);
2378 deblock_top = (s->mb_y > !!MB_FIELD);
2381 src_y -= linesize + 1;
2382 src_cb -= uvlinesize + 1;
2383 src_cr -= uvlinesize + 1;
2385 #define XCHG(a,b,t,xchg)\
2386 t= a;\
2387 if(xchg)\
2388 a= b;\
2389 b= t;
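/* XCHG(a,b,t,xchg): with xchg!=0 the two values are swapped, with xchg==0
 * a is merely copied into b. This temporarily replaces the edge pixels of
 * the current macroblock with the saved, still unfiltered, border samples,
 * so that intra prediction uses the unfiltered neighbours rather than the
 * already deblocked ones. */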
2391 if(deblock_left){
2392 for(i = !deblock_top; i<16; i++){
2393 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2395 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
2398 if(deblock_top){
2399 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2400 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2401 if(s->mb_x+1 < s->mb_width){
2402 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2406 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2407 if(deblock_left){
2408 for(i = !deblock_top; i<8; i++){
2409 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2410 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2412 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2413 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2415 if(deblock_top){
2416 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2417 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2422 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2423 MpegEncContext * const s = &h->s;
2424 const int mb_x= s->mb_x;
2425 const int mb_y= s->mb_y;
2426 const int mb_xy= h->mb_xy;
2427 const int mb_type= s->current_picture.mb_type[mb_xy];
2428 uint8_t *dest_y, *dest_cb, *dest_cr;
2429 int linesize, uvlinesize /*dct_offset*/;
2430 int i;
2431 int *block_offset = &h->block_offset[0];
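/* transform_bypass corresponds to the SPS qpprime_y_zero_transform_bypass
 * flag (lossless coding in the High 4:4:4 profiles): with qscale==0 the
 * residual skips the transform and is added to the prediction directly. */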
2432 const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2433 /* is_h264 should always be true if SVQ3 is disabled. */
2434 const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2435 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2436 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2438 dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
2439 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2440 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2442 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2443 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2445 if (!simple && MB_FIELD) {
2446 linesize = h->mb_linesize = s->linesize * 2;
2447 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2448 block_offset = &h->block_offset[24];
2449 if(mb_y&1){ //FIXME move out of this function?
2450 dest_y -= s->linesize*15;
2451 dest_cb-= s->uvlinesize*7;
2452 dest_cr-= s->uvlinesize*7;
2454 if(FRAME_MBAFF) {
2455 int list;
2456 for(list=0; list<h->list_count; list++){
2457 if(!USES_LIST(mb_type, list))
2458 continue;
2459 if(IS_16X16(mb_type)){
2460 int8_t *ref = &h->ref_cache[list][scan8[0]];
2461 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2462 }else{
2463 for(i=0; i<16; i+=4){
2464 int ref = h->ref_cache[list][scan8[i]];
2465 if(ref >= 0)
2466 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2471 } else {
2472 linesize = h->mb_linesize = s->linesize;
2473 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2474 // dct_offset = s->linesize * 16;
2477 if (!simple && IS_INTRA_PCM(mb_type)) {
2478 for (i=0; i<16; i++) {
2479 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2481 for (i=0; i<8; i++) {
2482 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2483 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2485 } else {
2486 if(IS_INTRA(mb_type)){
2487 if(h->deblocking_filter)
2488 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2490 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2491 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2492 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2495 if(IS_INTRA4x4(mb_type)){
2496 if(simple || !s->encoding){
2497 if(IS_8x8DCT(mb_type)){
2498 if(transform_bypass){
2499 idct_dc_add =
2500 idct_add = s->dsp.add_pixels8;
2501 }else{
2502 idct_dc_add = s->dsp.h264_idct8_dc_add;
2503 idct_add = s->dsp.h264_idct8_add;
2505 for(i=0; i<16; i+=4){
2506 uint8_t * const ptr= dest_y + block_offset[i];
2507 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2508 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2509 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2510 }else{
2511 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2512 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2513 (h->topright_samples_available<<i)&0x4000, linesize);
2514 if(nnz){
2515 if(nnz == 1 && h->mb[i*16])
2516 idct_dc_add(ptr, h->mb + i*16, linesize);
2517 else
2518 idct_add (ptr, h->mb + i*16, linesize);
2522 }else{
2523 if(transform_bypass){
2524 idct_dc_add =
2525 idct_add = s->dsp.add_pixels4;
2526 }else{
2527 idct_dc_add = s->dsp.h264_idct_dc_add;
2528 idct_add = s->dsp.h264_idct_add;
2530 for(i=0; i<16; i++){
2531 uint8_t * const ptr= dest_y + block_offset[i];
2532 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2534 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2535 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2536 }else{
2537 uint8_t *topright;
2538 int nnz, tr;
2539 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2540 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2541 assert(mb_y || linesize <= block_offset[i]);
2542 if(!topright_avail){
2543 tr= ptr[3 - linesize]*0x01010101;
2544 topright= (uint8_t*) &tr;
2545 }else
2546 topright= ptr + 4 - linesize;
2547 }else
2548 topright= NULL;
2550 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2551 nnz = h->non_zero_count_cache[ scan8[i] ];
2552 if(nnz){
2553 if(is_h264){
2554 if(nnz == 1 && h->mb[i*16])
2555 idct_dc_add(ptr, h->mb + i*16, linesize);
2556 else
2557 idct_add (ptr, h->mb + i*16, linesize);
2558 }else
2559 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2565 }else{
2566 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2567 if(is_h264){
2568 if(!transform_bypass)
2569 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2570 }else
2571 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2573 if(h->deblocking_filter)
2574 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2575 }else if(is_h264){
2576 hl_motion(h, dest_y, dest_cb, dest_cr,
2577 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2578 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2579 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2583 if(!IS_INTRA4x4(mb_type)){
2584 if(is_h264){
2585 if(IS_INTRA16x16(mb_type)){
2586 if(transform_bypass){
2587 if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2588 h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2589 }else{
2590 for(i=0; i<16; i++){
2591 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2592 s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2595 }else{
2596 s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2598 }else if(h->cbp&15){
2599 if(transform_bypass){
2600 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2601 idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2602 for(i=0; i<16; i+=di){
2603 if(h->non_zero_count_cache[ scan8[i] ]){
2604 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2607 }else{
2608 if(IS_8x8DCT(mb_type)){
2609 s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2610 }else{
2611 s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2615 }else{
2616 for(i=0; i<16; i++){
2617 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2618 uint8_t * const ptr= dest_y + block_offset[i];
2619 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2625 if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2626 uint8_t *dest[2] = {dest_cb, dest_cr};
2627 if(transform_bypass){
2628 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2629 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2630 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2631 }else{
2632 idct_add = s->dsp.add_pixels4;
2633 for(i=16; i<16+8; i++){
2634 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2635 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2638 }else{
2639 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2640 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2641 if(is_h264){
2642 idct_add = s->dsp.h264_idct_add;
2643 idct_dc_add = s->dsp.h264_idct_dc_add;
2644 for(i=16; i<16+8; i++){
2645 if(h->non_zero_count_cache[ scan8[i] ])
2646 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2647 else if(h->mb[i*16])
2648 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2650 }else{
2651 for(i=16; i<16+8; i++){
2652 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2653 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2654 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2661 if(h->cbp || IS_INTRA(mb_type))
2662 s->dsp.clear_blocks(h->mb);
2664 if(h->deblocking_filter) {
2665 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2666 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2667 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2668 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2669 if (!simple && FRAME_MBAFF) {
2670 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2671 } else {
2672 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2677 /**
2678 * Process a macroblock; this case avoids checks for expensive uncommon cases.
2679 */
2680 static void hl_decode_mb_simple(H264Context *h){
2681 hl_decode_mb_internal(h, 1);
2684 /**
2685 * Process a macroblock; this handles edge cases, such as interlacing.
2686 */
2687 static void av_noinline hl_decode_mb_complex(H264Context *h){
2688 hl_decode_mb_internal(h, 0);
2691 static void hl_decode_mb(H264Context *h){
2692 MpegEncContext * const s = &h->s;
2693 const int mb_xy= h->mb_xy;
2694 const int mb_type= s->current_picture.mb_type[mb_xy];
2695 int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2697 if (is_complex)
2698 hl_decode_mb_complex(h);
2699 else hl_decode_mb_simple(h);
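/* pic_as_field(): convert a frame Picture into one of its fields in place;
 * the data pointers are advanced by one line for the bottom field and the
 * linesizes are doubled, so the field can be addressed like a half-height
 * picture. */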
2702 static void pic_as_field(Picture *pic, const int parity){
2703 int i;
2704 for (i = 0; i < 4; ++i) {
2705 if (parity == PICT_BOTTOM_FIELD)
2706 pic->data[i] += pic->linesize[i];
2707 pic->reference = parity;
2708 pic->linesize[i] *= 2;
2710 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2713 static int split_field_copy(Picture *dest, Picture *src,
2714 int parity, int id_add){
2715 int match = !!(src->reference & parity);
2717 if (match) {
2718 *dest = *src;
2719 if(parity != PICT_FRAME){
2720 pic_as_field(dest, parity);
2721 dest->pic_id *= 2;
2722 dest->pic_id += id_add;
2726 return match;
2729 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2730 int i[2]={0};
2731 int index=0;
2733 while(i[0]<len || i[1]<len){
2734 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2735 i[0]++;
2736 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2737 i[1]++;
2738 if(i[0] < len){
2739 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2740 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2742 if(i[1] < len){
2743 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2744 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2748 return index;
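/* add_sorted(): pick the short term references whose POC lies on the
 * requested side of the current POC (past or future, depending on dir) and
 * emit them ordered by increasing POC distance, as the default B-slice
 * reference list construction requires. */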
2751 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2752 int i, best_poc;
2753 int out_i= 0;
2755 for(;;){
2756 best_poc= dir ? INT_MIN : INT_MAX;
2758 for(i=0; i<len; i++){
2759 const int poc= src[i]->poc;
2760 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2761 best_poc= poc;
2762 sorted[out_i]= src[i];
2765 if(best_poc == (dir ? INT_MIN : INT_MAX))
2766 break;
2767 limit= sorted[out_i++]->poc - dir;
2769 return out_i;
2772 /**
2773 * fills the default_ref_list.
2774 */
2775 static int fill_default_ref_list(H264Context *h){
2776 MpegEncContext * const s = &h->s;
2777 int i, len;
2779 if(h->slice_type_nos==FF_B_TYPE){
2780 Picture *sorted[32];
2781 int cur_poc, list;
2782 int lens[2];
2784 if(FIELD_PICTURE)
2785 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2786 else
2787 cur_poc= s->current_picture_ptr->poc;
2789 for(list= 0; list<2; list++){
2790 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2791 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2792 assert(len<=32);
2793 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2794 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2795 assert(len<=32);
2797 if(len < h->ref_count[list])
2798 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2799 lens[list]= len;
2802 if(lens[0] == lens[1] && lens[1] > 1){
2803 for(i=0; i<lens[0] && h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++); // check the bound before dereferencing
2804 if(i == lens[0])
2805 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2807 }else{
2808 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2809 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2810 assert(len <= 32);
2811 if(len < h->ref_count[0])
2812 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2814 #ifdef TRACE
2815 for (i=0; i<h->ref_count[0]; i++) {
2816 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2818 if(h->slice_type_nos==FF_B_TYPE){
2819 for (i=0; i<h->ref_count[1]; i++) {
2820 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2823 #endif
2824 return 0;
2827 static void print_short_term(H264Context *h);
2828 static void print_long_term(H264Context *h);
2830 /**
2831 * Extract structure information about the picture described by pic_num in
2832 * the current decoding context (frame or field). Note that pic_num is
2833 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2834 * @param pic_num picture number for which to extract structure information
2835 * @param structure one of PICT_XXX describing structure of picture
2836 * with pic_num
2837 * @return frame number (short term) or long term index of picture
2838 * described by pic_num
2839 */
2840 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2841 MpegEncContext * const s = &h->s;
2843 *structure = s->picture_structure;
2844 if(FIELD_PICTURE){
2845 if (!(pic_num & 1))
2846 /* opposite field */
2847 *structure ^= PICT_FRAME;
2848 pic_num >>= 1;
2851 return pic_num;
2854 static int decode_ref_pic_list_reordering(H264Context *h){
2855 MpegEncContext * const s = &h->s;
2856 int list, index, pic_structure;
2858 print_short_term(h);
2859 print_long_term(h);
2861 for(list=0; list<h->list_count; list++){
2862 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2864 if(get_bits1(&s->gb)){
2865 int pred= h->curr_pic_num;
2867 for(index=0; ; index++){
2868 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2869 unsigned int pic_id;
2870 int i;
2871 Picture *ref = NULL;
2873 if(reordering_of_pic_nums_idc==3)
2874 break;
2876 if(index >= h->ref_count[list]){
2877 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2878 return -1;
2881 if(reordering_of_pic_nums_idc<3){
2882 if(reordering_of_pic_nums_idc<2){
2883 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2884 int frame_num;
2886 if(abs_diff_pic_num > h->max_pic_num){
2887 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2888 return -1;
2891 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2892 else pred+= abs_diff_pic_num;
2893 pred &= h->max_pic_num - 1;
2895 frame_num = pic_num_extract(h, pred, &pic_structure);
2897 for(i= h->short_ref_count-1; i>=0; i--){
2898 ref = h->short_ref[i];
2899 assert(ref->reference);
2900 assert(!ref->long_ref);
2901 if(
2902 ref->frame_num == frame_num &&
2903 (ref->reference & pic_structure)
2904 )
2905 break;
2907 if(i>=0)
2908 ref->pic_id= pred;
2909 }else{
2910 int long_idx;
2911 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2913 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2915 if(long_idx>31){
2916 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2917 return -1;
2919 ref = h->long_ref[long_idx];
2920 assert(!(ref && !ref->reference));
2921 if(ref && (ref->reference & pic_structure)){
2922 ref->pic_id= pic_id;
2923 assert(ref->long_ref);
2924 i=0;
2925 }else{
2926 i=-1;
2930 if (i < 0) {
2931 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2932 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2933 } else {
2934 for(i=index; i+1<h->ref_count[list]; i++){
2935 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2936 break;
2938 for(; i > index; i--){
2939 h->ref_list[list][i]= h->ref_list[list][i-1];
2941 h->ref_list[list][index]= *ref;
2942 if (FIELD_PICTURE){
2943 pic_as_field(&h->ref_list[list][index], pic_structure);
2946 }else{
2947 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2948 return -1;
2953 for(list=0; list<h->list_count; list++){
2954 for(index= 0; index < h->ref_count[list]; index++){
2955 if(!h->ref_list[list][index].data[0]){
2956 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2957 if(h->default_ref_list[list][0].data[0])
2958 h->ref_list[list][index]= h->default_ref_list[list][0];
2959 else
2960 return -1;
2965 return 0;
2968 static void fill_mbaff_ref_list(H264Context *h){
2969 int list, i, j;
2970 for(list=0; list<2; list++){ //FIXME try list_count
2971 for(i=0; i<h->ref_count[list]; i++){
2972 Picture *frame = &h->ref_list[list][i];
2973 Picture *field = &h->ref_list[list][16+2*i];
2974 field[0] = *frame;
2975 for(j=0; j<3; j++)
2976 field[0].linesize[j] <<= 1;
2977 field[0].reference = PICT_TOP_FIELD;
2978 field[0].poc= field[0].field_poc[0];
2979 field[1] = field[0];
2980 for(j=0; j<3; j++)
2981 field[1].data[j] += frame->linesize[j];
2982 field[1].reference = PICT_BOTTOM_FIELD;
2983 field[1].poc= field[1].field_poc[1];
2985 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2986 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2987 for(j=0; j<2; j++){
2988 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2989 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2993 for(j=0; j<h->ref_count[1]; j++){
2994 for(i=0; i<h->ref_count[0]; i++)
2995 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2996 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2997 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3001 static int pred_weight_table(H264Context *h){
3002 MpegEncContext * const s = &h->s;
3003 int list, i;
3004 int luma_def, chroma_def;
3006 h->use_weight= 0;
3007 h->use_weight_chroma= 0;
3008 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3009 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3010 luma_def = 1<<h->luma_log2_weight_denom;
3011 chroma_def = 1<<h->chroma_log2_weight_denom;
3013 for(list=0; list<2; list++){
3014 h->luma_weight_flag[list] = 0;
3015 h->chroma_weight_flag[list] = 0;
3016 for(i=0; i<h->ref_count[list]; i++){
3017 int luma_weight_flag, chroma_weight_flag;
3019 luma_weight_flag= get_bits1(&s->gb);
3020 if(luma_weight_flag){
3021 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3022 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3023 if( h->luma_weight[list][i] != luma_def
3024 || h->luma_offset[list][i] != 0) {
3025 h->use_weight= 1;
3026 h->luma_weight_flag[list]= 1;
3028 }else{
3029 h->luma_weight[list][i]= luma_def;
3030 h->luma_offset[list][i]= 0;
3033 if(CHROMA){
3034 chroma_weight_flag= get_bits1(&s->gb);
3035 if(chroma_weight_flag){
3036 int j;
3037 for(j=0; j<2; j++){
3038 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3039 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3040 if( h->chroma_weight[list][i][j] != chroma_def
3041 || h->chroma_offset[list][i][j] != 0) {
3042 h->use_weight_chroma= 1;
3043 h->chroma_weight_flag[list]= 1;
3046 }else{
3047 int j;
3048 for(j=0; j<2; j++){
3049 h->chroma_weight[list][i][j]= chroma_def;
3050 h->chroma_offset[list][i][j]= 0;
3055 if(h->slice_type_nos != FF_B_TYPE) break;
3057 h->use_weight= h->use_weight || h->use_weight_chroma;
3058 return 0;
3061 static void implicit_weight_table(H264Context *h){
3062 MpegEncContext * const s = &h->s;
3063 int ref0, ref1, i;
3064 int cur_poc = s->current_picture_ptr->poc;
3066 for (i = 0; i < 2; i++) {
3067 h->luma_weight_flag[i] = 0;
3068 h->chroma_weight_flag[i] = 0;
3071 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3072 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3073 h->use_weight= 0;
3074 h->use_weight_chroma= 0;
3075 return;
3078 h->use_weight= 2;
3079 h->use_weight_chroma= 2;
3080 h->luma_log2_weight_denom= 5;
3081 h->chroma_log2_weight_denom= 5;
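/* Implicit bi-prediction weights are derived from the POC distances, e.g.
 * with cur_poc=4, poc0=0, poc1=8: td=8, tb=4, tx=(16384+4)/8=2048,
 * dist_scale_factor=((4*2048+32)>>6)>>2=32, so implicit_weight=64-32=32,
 * i.e. an equal 32/32 split when the current picture lies halfway between
 * its two references. */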
3083 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3084 int poc0 = h->ref_list[0][ref0].poc;
3085 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3086 int poc1 = h->ref_list[1][ref1].poc;
3087 int td = av_clip(poc1 - poc0, -128, 127);
3088 if(td){
3089 int tb = av_clip(cur_poc - poc0, -128, 127);
3090 int tx = (16384 + (FFABS(td) >> 1)) / td;
3091 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3092 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3093 h->implicit_weight[ref0][ref1] = 32;
3094 else
3095 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3096 }else
3097 h->implicit_weight[ref0][ref1] = 32;
3102 /**
3103 * Mark a picture as no longer needed for reference. The refmask
3104 * argument allows unreferencing of individual fields or the whole frame.
3105 * If the picture becomes entirely unreferenced, but is being held for
3106 * display purposes, it is marked as such.
3107 * @param refmask mask of fields to unreference; the mask is bitwise
3108 * anded with the reference marking of pic
3109 * @return non-zero if pic becomes entirely unreferenced (except possibly
3110 * for display purposes) zero if one of the fields remains in
3111 * reference
3112 */
3113 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3114 int i;
3115 if (pic->reference &= refmask) {
3116 return 0;
3117 } else {
3118 for(i = 0; h->delayed_pic[i]; i++)
3119 if(pic == h->delayed_pic[i]){
3120 pic->reference=DELAYED_PIC_REF;
3121 break;
3123 return 1;
3127 /**
3128 * instantaneous decoder refresh.
3129 */
3130 static void idr(H264Context *h){
3131 int i;
3133 for(i=0; i<16; i++){
3134 remove_long(h, i, 0);
3136 assert(h->long_ref_count==0);
3138 for(i=0; i<h->short_ref_count; i++){
3139 unreference_pic(h, h->short_ref[i], 0);
3140 h->short_ref[i]= NULL;
3142 h->short_ref_count=0;
3143 h->prev_frame_num= 0;
3144 h->prev_frame_num_offset= 0;
3145 h->prev_poc_msb=
3146 h->prev_poc_lsb= 0;
3149 /* forget old pics after a seek */
3150 static void flush_dpb(AVCodecContext *avctx){
3151 H264Context *h= avctx->priv_data;
3152 int i;
3153 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3154 if(h->delayed_pic[i])
3155 h->delayed_pic[i]->reference= 0;
3156 h->delayed_pic[i]= NULL;
3158 h->outputed_poc= INT_MIN;
3159 h->prev_interlaced_frame = 1;
3160 idr(h);
3161 if(h->s.current_picture_ptr)
3162 h->s.current_picture_ptr->reference= 0;
3163 h->s.first_field= 0;
3164 reset_sei(h);
3165 ff_mpeg_flush(avctx);
3168 /**
3169 * Find a Picture in the short term reference list by frame number.
3170 * @param frame_num frame number to search for
3171 * @param idx the index into h->short_ref where returned picture is found
3172 * undefined if no picture found.
3173 * @return pointer to the found picture, or NULL if no pic with the provided
3174 * frame number is found
3175 */
3176 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3177 MpegEncContext * const s = &h->s;
3178 int i;
3180 for(i=0; i<h->short_ref_count; i++){
3181 Picture *pic= h->short_ref[i];
3182 if(s->avctx->debug&FF_DEBUG_MMCO)
3183 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3184 if(pic->frame_num == frame_num) {
3185 *idx = i;
3186 return pic;
3189 return NULL;
3192 /**
3193 * Remove a picture from the short term reference list by its index in
3194 * that list. This does no checking on the provided index; it is assumed
3195 * to be valid. Other list entries are shifted down.
3196 * @param i index into h->short_ref of picture to remove.
3197 */
3198 static void remove_short_at_index(H264Context *h, int i){
3199 assert(i >= 0 && i < h->short_ref_count);
3200 h->short_ref[i]= NULL;
3201 if (--h->short_ref_count)
3202 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3206 /**
3207 * @return the removed picture or NULL if an error occurs
3208 */
3209 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3210 MpegEncContext * const s = &h->s;
3211 Picture *pic;
3212 int i;
3214 if(s->avctx->debug&FF_DEBUG_MMCO)
3215 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3217 pic = find_short(h, frame_num, &i);
3218 if (pic){
3219 if(unreference_pic(h, pic, ref_mask))
3220 remove_short_at_index(h, i);
3223 return pic;
3226 /**
3227 * Remove a picture from the long term reference list by its index in
3228 * that list.
3229 * @return the removed picture or NULL if an error occurs
3230 */
3231 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3232 Picture *pic;
3234 pic= h->long_ref[i];
3235 if (pic){
3236 if(unreference_pic(h, pic, ref_mask)){
3237 assert(h->long_ref[i]->long_ref == 1);
3238 h->long_ref[i]->long_ref= 0;
3239 h->long_ref[i]= NULL;
3240 h->long_ref_count--;
3244 return pic;
3247 /**
3248 * print short term list
3249 */
3250 static void print_short_term(H264Context *h) {
3251 uint32_t i;
3252 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3253 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3254 for(i=0; i<h->short_ref_count; i++){
3255 Picture *pic= h->short_ref[i];
3256 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3261 /**
3262 * print long term list
3263 */
3264 static void print_long_term(H264Context *h) {
3265 uint32_t i;
3266 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3267 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3268 for(i = 0; i < 16; i++){
3269 Picture *pic= h->long_ref[i];
3270 if (pic) {
3271 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3277 /**
3278 * Executes the reference picture marking (memory management control operations).
3279 */
3280 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3281 MpegEncContext * const s = &h->s;
3282 int i, av_uninit(j);
3283 int current_ref_assigned=0;
3284 Picture *av_uninit(pic);
3286 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3287 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3289 for(i=0; i<mmco_count; i++){
3290 int av_uninit(structure), av_uninit(frame_num);
3291 if(s->avctx->debug&FF_DEBUG_MMCO)
3292 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3294 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3295 || mmco[i].opcode == MMCO_SHORT2LONG){
3296 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3297 pic = find_short(h, frame_num, &j);
3298 if(!pic){
3299 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3300 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3301 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3302 continue;
3306 switch(mmco[i].opcode){
3307 case MMCO_SHORT2UNUSED:
3308 if(s->avctx->debug&FF_DEBUG_MMCO)
3309 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3310 remove_short(h, frame_num, structure ^ PICT_FRAME);
3311 break;
3312 case MMCO_SHORT2LONG:
3313 if (h->long_ref[mmco[i].long_arg] != pic)
3314 remove_long(h, mmco[i].long_arg, 0);
3316 remove_short_at_index(h, j);
3317 h->long_ref[ mmco[i].long_arg ]= pic;
3318 if (h->long_ref[ mmco[i].long_arg ]){
3319 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3320 h->long_ref_count++;
3322 break;
3323 case MMCO_LONG2UNUSED:
3324 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3325 pic = h->long_ref[j];
3326 if (pic) {
3327 remove_long(h, j, structure ^ PICT_FRAME);
3328 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3329 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3330 break;
3331 case MMCO_LONG:
3332 // Comment below kept from previous code, as it is an interesting note.
3333 /* First field in pair is in short term list or
3334 * at a different long term index.
3335 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3336 * Report the problem and keep the pair where it is,
3337 * and mark this field valid.
3340 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3341 remove_long(h, mmco[i].long_arg, 0);
3343 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3344 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3345 h->long_ref_count++;
3348 s->current_picture_ptr->reference |= s->picture_structure;
3349 current_ref_assigned=1;
3350 break;
3351 case MMCO_SET_MAX_LONG:
3352 assert(mmco[i].long_arg <= 16);
3353 // just remove the long term refs whose index is greater than the new max
3354 for(j = mmco[i].long_arg; j<16; j++){
3355 remove_long(h, j, 0);
3357 break;
3358 case MMCO_RESET:
3359 while(h->short_ref_count){
3360 remove_short(h, h->short_ref[0]->frame_num, 0);
3362 for(j = 0; j < 16; j++) {
3363 remove_long(h, j, 0);
3365 s->current_picture_ptr->poc=
3366 s->current_picture_ptr->field_poc[0]=
3367 s->current_picture_ptr->field_poc[1]=
3368 h->poc_lsb=
3369 h->poc_msb=
3370 h->frame_num=
3371 s->current_picture_ptr->frame_num= 0;
3372 s->current_picture_ptr->mmco_reset=1;
3373 break;
3374 default: assert(0);
3378 if (!current_ref_assigned) {
3379 /* Second field of a complementary field pair; the first field
3380 * is already referenced. If it is short-term referenced, it
3381 * should be the first entry in short_ref. If not, it must exist
3382 * in long_ref; trying to put it on the short list here is an
3383 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3385 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3386 /* Just mark the second field valid */
3387 s->current_picture_ptr->reference = PICT_FRAME;
3388 } else if (s->current_picture_ptr->long_ref) {
3389 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3390 "assignment for second field "
3391 "in complementary field pair "
3392 "(first field is long term)\n");
3393 } else {
3394 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3395 if(pic){
3396 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3399 if(h->short_ref_count)
3400 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3402 h->short_ref[0]= s->current_picture_ptr;
3403 h->short_ref_count++;
3404 s->current_picture_ptr->reference |= s->picture_structure;
3408 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3410 /* We have too many reference frames, probably due to a corrupted
3411 * stream. We need to discard one frame to prevent overrun of the
3412 * short_ref and long_ref buffers.
3414 av_log(h->s.avctx, AV_LOG_ERROR,
3415 "number of reference frames exceeds max (probably "
3416 "corrupt input), discarding one\n");
3418 if (h->long_ref_count && !h->short_ref_count) {
3419 for (i = 0; i < 16; ++i)
3420 if (h->long_ref[i])
3421 break;
3423 assert(i < 16);
3424 remove_long(h, i, 0);
3425 } else {
3426 pic = h->short_ref[h->short_ref_count - 1];
3427 remove_short(h, pic->frame_num, 0);
3431 print_short_term(h);
3432 print_long_term(h);
3433 return 0;
3436 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3437 MpegEncContext * const s = &h->s;
3438 int i;
3440 h->mmco_index= 0;
3441 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
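/* IDR slices: the two flags read below are no_output_of_prior_pics_flag
 * (stored, offset by -1, in broken_link) and long_term_reference_flag,
 * which marks the current picture as long-term reference with index 0. */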
3442 s->broken_link= get_bits1(gb) -1;
3443 if(get_bits1(gb)){
3444 h->mmco[0].opcode= MMCO_LONG;
3445 h->mmco[0].long_arg= 0;
3446 h->mmco_index= 1;
3448 }else{
3449 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3450 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3451 MMCOOpcode opcode= get_ue_golomb_31(gb);
3453 h->mmco[i].opcode= opcode;
3454 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3455 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3456 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3457 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3458 return -1;
3461 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3462 unsigned int long_arg= get_ue_golomb_31(gb);
3463 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3464 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3465 return -1;
3467 h->mmco[i].long_arg= long_arg;
3470 if(opcode > (unsigned)MMCO_LONG){
3471 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3472 return -1;
3474 if(opcode == MMCO_END)
3475 break;
3477 h->mmco_index= i;
3478 }else{
3479 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
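/* Sliding window marking: no explicit MMCOs were sent, so if the reference
 * buffer is full, synthesize an MMCO that unreferences the oldest short-term
 * picture (both of its fields when decoding field pictures). */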
3481 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3482 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3483 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3484 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3485 h->mmco_index= 1;
3486 if (FIELD_PICTURE) {
3487 h->mmco[0].short_pic_num *= 2;
3488 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3489 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3490 h->mmco_index= 2;
3496 return 0;
3499 static int init_poc(H264Context *h){
3500 MpegEncContext * const s = &h->s;
3501 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3502 int field_poc[2];
3503 Picture *cur = s->current_picture_ptr;
3505 h->frame_num_offset= h->prev_frame_num_offset;
3506 if(h->frame_num < h->prev_frame_num)
3507 h->frame_num_offset += max_frame_num;
3509 if(h->sps.poc_type==0){
3510 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
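/* Derive PicOrderCntMsb as in 8.2.1.1: step the MSB up or down by max_poc_lsb
 * when the LSB wraps relative to the previous reference picture's LSB,
 * otherwise keep the previous MSB. */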
3512 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3513 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3514 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3515 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3516 else
3517 h->poc_msb = h->prev_poc_msb;
3518 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3519 field_poc[0] =
3520 field_poc[1] = h->poc_msb + h->poc_lsb;
3521 if(s->picture_structure == PICT_FRAME)
3522 field_poc[1] += h->delta_poc_bottom;
3523 }else if(h->sps.poc_type==1){
3524 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3525 int i;
3527 if(h->sps.poc_cycle_length != 0)
3528 abs_frame_num = h->frame_num_offset + h->frame_num;
3529 else
3530 abs_frame_num = 0;
3532 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3533 abs_frame_num--;
3535 expected_delta_per_poc_cycle = 0;
3536 for(i=0; i < h->sps.poc_cycle_length; i++)
3537 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3539 if(abs_frame_num > 0){
3540 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3541 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3543 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3544 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3545 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3546 } else
3547 expectedpoc = 0;
3549 if(h->nal_ref_idc == 0)
3550 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3552 field_poc[0] = expectedpoc + h->delta_poc[0];
3553 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3555 if(s->picture_structure == PICT_FRAME)
3556 field_poc[1] += h->delta_poc[1];
3557 }else{
3558 int poc= 2*(h->frame_num_offset + h->frame_num);
3560 if(!h->nal_ref_idc)
3561 poc--;
3563 field_poc[0]= poc;
3564 field_poc[1]= poc;
3567 if(s->picture_structure != PICT_BOTTOM_FIELD)
3568 s->current_picture_ptr->field_poc[0]= field_poc[0];
3569 if(s->picture_structure != PICT_TOP_FIELD)
3570 s->current_picture_ptr->field_poc[1]= field_poc[1];
3571 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3573 return 0;
3578 * initialize scan tables
3580 static void init_scan_tables(H264Context *h){
3581 MpegEncContext * const s = &h->s;
3582 int i;
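/* When the IDCT in use is not the plain C one, the scan tables are permuted:
 * the T() macros below swap the row and column of each scan position (a
 * transpose of the block coordinates), presumably to match the coefficient
 * layout the overriding IDCT implementations expect. */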
3583 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME a little ugly
3584 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3585 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3586 }else{
3587 for(i=0; i<16; i++){
3588 #define T(x) (x>>2) | ((x<<2) & 0xF)
3589 h->zigzag_scan[i] = T(zigzag_scan[i]);
3590 h-> field_scan[i] = T( field_scan[i]);
3591 #undef T
3594 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3595 memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
3596 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3597 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3598 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3599 }else{
3600 for(i=0; i<64; i++){
3601 #define T(x) (x>>3) | ((x&7)<<3)
3602 h->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]);
3603 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3604 h->field_scan8x8[i] = T(field_scan8x8[i]);
3605 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3606 #undef T
3609 if(h->sps.transform_bypass){ //FIXME same ugliness as above
3610 h->zigzag_scan_q0 = zigzag_scan;
3611 h->zigzag_scan8x8_q0 = ff_zigzag_direct;
3612 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3613 h->field_scan_q0 = field_scan;
3614 h->field_scan8x8_q0 = field_scan8x8;
3615 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3616 }else{
3617 h->zigzag_scan_q0 = h->zigzag_scan;
3618 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3619 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3620 h->field_scan_q0 = h->field_scan;
3621 h->field_scan8x8_q0 = h->field_scan8x8;
3622 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3626 static void field_end(H264Context *h){
3627 MpegEncContext * const s = &h->s;
3628 AVCodecContext * const avctx= s->avctx;
3629 s->mb_y= 0;
3631 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3632 s->current_picture_ptr->pict_type= s->pict_type;
3634 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3635 ff_vdpau_h264_set_reference_frames(s);
3637 if(!s->dropable) {
3638 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3639 h->prev_poc_msb= h->poc_msb;
3640 h->prev_poc_lsb= h->poc_lsb;
3642 h->prev_frame_num_offset= h->frame_num_offset;
3643 h->prev_frame_num= h->frame_num;
3645 if (avctx->hwaccel) {
3646 if (avctx->hwaccel->end_frame(avctx) < 0)
3647 av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3650 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3651 ff_vdpau_h264_picture_complete(s);
3654 * FIXME: Error handling code does not seem to support interlaced video
3655 * when slices span multiple rows.
3656 * The ff_er_add_slice calls don't work right for bottom
3657 * fields; they cause massive erroneous error concealing.
3658 * Error marking covers both fields (top and bottom).
3659 * This causes a mismatched s->error_count
3660 * and a bad error table. Further, the error count goes to
3661 * INT_MAX when called for the bottom field, because mb_y is
3662 * past the end by one (the caller's fault) and resync_mb_y != 0
3663 * causes problems for the first MB line, too.
3665 if (!FIELD_PICTURE)
3666 ff_er_frame_end(s);
3668 MPV_frame_end(s);
3670 h->current_slice=0;
3674 * Replicates H264 "master" context to thread contexts.
3676 static void clone_slice(H264Context *dst, H264Context *src)
3678 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3679 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3680 dst->s.current_picture = src->s.current_picture;
3681 dst->s.linesize = src->s.linesize;
3682 dst->s.uvlinesize = src->s.uvlinesize;
3683 dst->s.first_field = src->s.first_field;
3685 dst->prev_poc_msb = src->prev_poc_msb;
3686 dst->prev_poc_lsb = src->prev_poc_lsb;
3687 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3688 dst->prev_frame_num = src->prev_frame_num;
3689 dst->short_ref_count = src->short_ref_count;
3691 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3692 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3693 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3694 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3696 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3697 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3701 * decodes a slice header.
3702 * This will also call MPV_common_init() and frame_start() as needed.
3704 * @param h h264context
3705 * @param h0 h264 master context (differs from 'h' when doing slice-based parallel decoding)
3707 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3709 static int decode_slice_header(H264Context *h, H264Context *h0){
3710 MpegEncContext * const s = &h->s;
3711 MpegEncContext * const s0 = &h0->s;
3712 unsigned int first_mb_in_slice;
3713 unsigned int pps_id;
3714 int num_ref_idx_active_override_flag;
3715 unsigned int slice_type, tmp, i, j;
3716 int default_ref_list_done = 0;
3717 int last_pic_structure;
3719 s->dropable= h->nal_ref_idc == 0;
3721 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3722 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3723 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3724 }else{
3725 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3726 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3729 first_mb_in_slice= get_ue_golomb(&s->gb);
3731 if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3732 if(h0->current_slice && FIELD_PICTURE){
3733 field_end(h);
3736 h0->current_slice = 0;
3737 if (!s0->first_field)
3738 s->current_picture_ptr= NULL;
3741 slice_type= get_ue_golomb_31(&s->gb);
3742 if(slice_type > 9){
3743 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3744 return -1;
3746 if(slice_type > 4){
3747 slice_type -= 5;
3748 h->slice_type_fixed=1;
3749 }else
3750 h->slice_type_fixed=0;
3752 slice_type= golomb_to_pict_type[ slice_type ];
3753 if (slice_type == FF_I_TYPE
3754 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3755 default_ref_list_done = 1;
3757 h->slice_type= slice_type;
3758 h->slice_type_nos= slice_type & 3;
3760 s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3761 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3762 av_log(h->s.avctx, AV_LOG_ERROR,
3763 "B picture before any references, skipping\n");
3764 return -1;
3767 pps_id= get_ue_golomb(&s->gb);
3768 if(pps_id>=MAX_PPS_COUNT){
3769 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3770 return -1;
3772 if(!h0->pps_buffers[pps_id]) {
3773 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3774 return -1;
3776 h->pps= *h0->pps_buffers[pps_id];
3778 if(!h0->sps_buffers[h->pps.sps_id]) {
3779 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3780 return -1;
3782 h->sps = *h0->sps_buffers[h->pps.sps_id];
3784 if(h == h0 && h->dequant_coeff_pps != pps_id){
3785 h->dequant_coeff_pps = pps_id;
3786 init_dequant_tables(h);
3789 s->mb_width= h->sps.mb_width;
3790 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3792 h->b_stride= s->mb_width*4;
3793 h->b8_stride= s->mb_width*2;
3795 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3796 if(h->sps.frame_mbs_only_flag)
3797 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3798 else
3799 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3801 if (s->context_initialized
3802 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3803 if(h != h0)
3804 return -1; // width / height changed during parallelized decoding
3805 free_tables(h);
3806 flush_dpb(s->avctx);
3807 MPV_common_end(s);
3809 if (!s->context_initialized) {
3810 if(h != h0)
3811 return -1; // we can't (re-)initialize the context during parallel decoding
3812 if (MPV_common_init(s) < 0)
3813 return -1;
3814 s->first_field = 0;
3815 h->prev_interlaced_frame = 1;
3817 init_scan_tables(h);
3818 alloc_tables(h);
3820 for(i = 1; i < s->avctx->thread_count; i++) {
3821 H264Context *c;
3822 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3823 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3824 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3825 c->sps = h->sps;
3826 c->pps = h->pps;
3827 init_scan_tables(c);
3828 clone_tables(c, h);
3831 for(i = 0; i < s->avctx->thread_count; i++)
3832 if(context_init(h->thread_context[i]) < 0)
3833 return -1;
3835 s->avctx->width = s->width;
3836 s->avctx->height = s->height;
3837 s->avctx->sample_aspect_ratio= h->sps.sar;
3838 if(!s->avctx->sample_aspect_ratio.den)
3839 s->avctx->sample_aspect_ratio.den = 1;
3841 if(h->sps.timing_info_present_flag){
3842 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3843 if(h->x264_build > 0 && h->x264_build < 44)
3844 s->avctx->time_base.den *= 2;
3845 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3846 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3850 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3852 h->mb_mbaff = 0;
3853 h->mb_aff_frame = 0;
3854 last_pic_structure = s0->picture_structure;
3855 if(h->sps.frame_mbs_only_flag){
3856 s->picture_structure= PICT_FRAME;
3857 }else{
3858 if(get_bits1(&s->gb)) { //field_pic_flag
3859 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3860 } else {
3861 s->picture_structure= PICT_FRAME;
3862 h->mb_aff_frame = h->sps.mb_aff;
3865 h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3867 if(h0->current_slice == 0){
3868 while(h->frame_num != h->prev_frame_num &&
3869 h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3870 av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3871 if (frame_start(h) < 0)
3872 return -1;
3873 h->prev_frame_num++;
3874 h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3875 s->current_picture_ptr->frame_num= h->prev_frame_num;
3876 execute_ref_pic_marking(h, NULL, 0);
3879 /* See if we have a decoded first field looking for a pair... */
3880 if (s0->first_field) {
3881 assert(s0->current_picture_ptr);
3882 assert(s0->current_picture_ptr->data[0]);
3883 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3885 /* figure out if we have a complementary field pair */
3886 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3888 * Previous field is unmatched. Don't display it, but let it
3889 * remain for reference if marked as such.
3891 s0->current_picture_ptr = NULL;
3892 s0->first_field = FIELD_PICTURE;
3894 } else {
3895 if (h->nal_ref_idc &&
3896 s0->current_picture_ptr->reference &&
3897 s0->current_picture_ptr->frame_num != h->frame_num) {
3899 * This and previous field were reference, but had
3900 * different frame_nums. Consider this field first in
3901 * pair. Throw away previous field except for reference
3902 * purposes.
3904 s0->first_field = 1;
3905 s0->current_picture_ptr = NULL;
3907 } else {
3908 /* Second field in complementary pair */
3909 s0->first_field = 0;
3913 } else {
3914 /* Frame or first field in a potentially complementary pair */
3915 assert(!s0->current_picture_ptr);
3916 s0->first_field = FIELD_PICTURE;
3919 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3920 s0->first_field = 0;
3921 return -1;
3924 if(h != h0)
3925 clone_slice(h, h0);
3927 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3929 assert(s->mb_num == s->mb_width * s->mb_height);
3930 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3931 first_mb_in_slice >= s->mb_num){
3932 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3933 return -1;
3935 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3936 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3937 if (s->picture_structure == PICT_BOTTOM_FIELD)
3938 s->resync_mb_y = s->mb_y = s->mb_y + 1;
3939 assert(s->mb_y < s->mb_height);
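/* Picture numbering for reference selection: in field pictures each field is
 * numbered separately, so CurrPicNum = 2*frame_num + 1 and MaxPicNum doubles
 * (one extra bit), as set up below. */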
3941 if(s->picture_structure==PICT_FRAME){
3942 h->curr_pic_num= h->frame_num;
3943 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3944 }else{
3945 h->curr_pic_num= 2*h->frame_num + 1;
3946 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3949 if(h->nal_unit_type == NAL_IDR_SLICE){
3950 get_ue_golomb(&s->gb); /* idr_pic_id */
3953 if(h->sps.poc_type==0){
3954 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3956 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3957 h->delta_poc_bottom= get_se_golomb(&s->gb);
3961 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3962 h->delta_poc[0]= get_se_golomb(&s->gb);
3964 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3965 h->delta_poc[1]= get_se_golomb(&s->gb);
3968 init_poc(h);
3970 if(h->pps.redundant_pic_cnt_present){
3971 h->redundant_pic_count= get_ue_golomb(&s->gb);
3974 //set defaults, might be overridden a few lines later
3975 h->ref_count[0]= h->pps.ref_count[0];
3976 h->ref_count[1]= h->pps.ref_count[1];
3978 if(h->slice_type_nos != FF_I_TYPE){
3979 if(h->slice_type_nos == FF_B_TYPE){
3980 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3982 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3984 if(num_ref_idx_active_override_flag){
3985 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3986 if(h->slice_type_nos==FF_B_TYPE)
3987 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3989 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3990 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3991 h->ref_count[0]= h->ref_count[1]= 1;
3992 return -1;
3995 if(h->slice_type_nos == FF_B_TYPE)
3996 h->list_count= 2;
3997 else
3998 h->list_count= 1;
3999 }else
4000 h->list_count= 0;
4002 if(!default_ref_list_done){
4003 fill_default_ref_list(h);
4006 if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4007 return -1;
4009 if(h->slice_type_nos!=FF_I_TYPE){
4010 s->last_picture_ptr= &h->ref_list[0][0];
4011 ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4013 if(h->slice_type_nos==FF_B_TYPE){
4014 s->next_picture_ptr= &h->ref_list[1][0];
4015 ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4018 if( (h->pps.weighted_pred && h->slice_type_nos == FF_P_TYPE )
4019 || (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4020 pred_weight_table(h);
4021 else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4022 implicit_weight_table(h);
4023 else {
4024 h->use_weight = 0;
4025 for (i = 0; i < 2; i++) {
4026 h->luma_weight_flag[i] = 0;
4027 h->chroma_weight_flag[i] = 0;
4031 if(h->nal_ref_idc)
4032 decode_ref_pic_marking(h0, &s->gb);
4034 if(FRAME_MBAFF)
4035 fill_mbaff_ref_list(h);
4037 if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4038 direct_dist_scale_factor(h);
4039 direct_ref_list_init(h);
4041 if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4042 tmp = get_ue_golomb_31(&s->gb);
4043 if(tmp > 2){
4044 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4045 return -1;
4047 h->cabac_init_idc= tmp;
4050 h->last_qscale_diff = 0;
4051 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4052 if(tmp>51){
4053 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4054 return -1;
4056 s->qscale= tmp;
4057 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4058 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4059 //FIXME qscale / qp ... stuff
4060 if(h->slice_type == FF_SP_TYPE){
4061 get_bits1(&s->gb); /* sp_for_switch_flag */
4063 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4064 get_se_golomb(&s->gb); /* slice_qs_delta */
4067 h->deblocking_filter = 1;
4068 h->slice_alpha_c0_offset = 0;
4069 h->slice_beta_offset = 0;
4070 if( h->pps.deblocking_filter_parameters_present ) {
4071 tmp= get_ue_golomb_31(&s->gb);
4072 if(tmp > 2){
4073 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4074 return -1;
4076 h->deblocking_filter= tmp;
4077 if(h->deblocking_filter < 2)
4078 h->deblocking_filter^= 1; // 1<->0: bitstream value 0 means the filter is enabled; internally 1 means enabled
4080 if( h->deblocking_filter ) {
4081 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4082 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4086 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
4087 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4088 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type_nos == FF_B_TYPE)
4089 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4090 h->deblocking_filter= 0;
4092 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4093 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4094 /* Cheat slightly for speed:
4095 Do not bother to deblock across slices. */
4096 h->deblocking_filter = 2;
4097 } else {
4098 h0->max_contexts = 1;
4099 if(!h0->single_decode_warning) {
4100 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4101 h0->single_decode_warning = 1;
4103 if(h != h0)
4104 return 1; // deblocking switched inside frame
4108 #if 0 //FMO
4109 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4110 slice_group_change_cycle= get_bits(&s->gb, ?);
4111 #endif
4113 h0->last_slice_type = slice_type;
4114 h->slice_num = ++h0->current_slice;
4115 if(h->slice_num >= MAX_SLICES){
4116 av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4119 for(j=0; j<2; j++){
4120 int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4121 ref2frm[0]=
4122 ref2frm[1]= -1;
4123 for(i=0; i<16; i++)
4124 ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4125 +(h->ref_list[j][i].reference&3);
4126 ref2frm[18+0]=
4127 ref2frm[18+1]= -1;
4128 for(i=16; i<48; i++)
4129 ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4130 +(h->ref_list[j][i].reference&3);
4133 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4134 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4136 s->avctx->refs= h->sps.ref_frame_count;
4138 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4139 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4140 h->slice_num,
4141 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4142 first_mb_in_slice,
4143 av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4144 pps_id, h->frame_num,
4145 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4146 h->ref_count[0], h->ref_count[1],
4147 s->qscale,
4148 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4149 h->use_weight,
4150 h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4151 h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4155 return 0;
4161 static inline int get_level_prefix(GetBitContext *gb){
4162 unsigned int buf;
4163 int log;
4165 OPEN_READER(re, gb);
4166 UPDATE_CACHE(re, gb);
4167 buf=GET_CACHE(re, gb);
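/* level_prefix is a unary code: count the zero bits before the first 1 bit.
 * log is the position of that 1 counted from the MSB (1-based), so the reader
 * skips log bits and returns log-1 as the prefix value. */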
4169 log= 32 - av_log2(buf);
4170 #ifdef TRACE
4171 print_bin(buf>>(32-log), log);
4172 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4173 #endif
4175 LAST_SKIP_BITS(re, gb, log);
4176 CLOSE_READER(re, gb);
4178 return log-1;
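/**
 * The 8x8 transform requires every sub-macroblock partition to be at least
 * 8x8. Direct 8x8 sub-blocks only qualify when direct_8x8_inference_flag is
 * set, since their motion is then inferred at 8x8 granularity.
 */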
4181 static inline int get_dct8x8_allowed(H264Context *h){
4182 if(h->sps.direct_8x8_inference_flag)
4183 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL));
4184 else
4185 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4189 * decodes a residual block.
4190 * @param n block index
4191 * @param scantable scantable
4192 * @param max_coeff number of coefficients in the block
4193 * @return <0 if an error occurred
4195 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4196 MpegEncContext * const s = &h->s;
4197 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4198 int level[16];
4199 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4201 //FIXME put trailing_ones into the context
4203 if(n == CHROMA_DC_BLOCK_INDEX){
4204 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4205 total_coeff= coeff_token>>2;
4206 }else{
4207 if(n == LUMA_DC_BLOCK_INDEX){
4208 total_coeff= pred_non_zero_count(h, 0);
4209 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4210 total_coeff= coeff_token>>2;
4211 }else{
4212 total_coeff= pred_non_zero_count(h, n);
4213 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4214 total_coeff= coeff_token>>2;
4215 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4219 //FIXME set last_non_zero?
4221 if(total_coeff==0)
4222 return 0;
4223 if(total_coeff > (unsigned)max_coeff) {
4224 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4225 return -1;
4228 trailing_ones= coeff_token&3;
4229 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4230 assert(total_coeff<=16);
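/* Read the (up to three) trailing-ones sign bits in one go: a set bit means
 * the coefficient is -1, a clear bit means +1. Only trailing_ones of these
 * bits are consumed; the remaining level[] slots are overwritten or unused. */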
4232 i = show_bits(gb, 3);
4233 skip_bits(gb, trailing_ones);
4234 level[0] = 1-((i&4)>>1);
4235 level[1] = 1-((i&2) );
4236 level[2] = 1-((i&1)<<1);
4238 if(trailing_ones<total_coeff) {
4239 int mask, prefix;
4240 int suffix_length = total_coeff > 10 && trailing_ones < 3;
4241 int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4242 int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4244 skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4245 if(level_code >= 100){
4246 prefix= level_code - 100;
4247 if(prefix == LEVEL_TAB_BITS)
4248 prefix += get_level_prefix(gb);
4250 //first coefficient has suffix_length equal to 0 or 1
4251 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4252 if(suffix_length)
4253 level_code= (prefix<<1) + get_bits1(gb); //part
4254 else
4255 level_code= prefix; //part
4256 }else if(prefix==14){
4257 if(suffix_length)
4258 level_code= (prefix<<1) + get_bits1(gb); //part
4259 else
4260 level_code= prefix + get_bits(gb, 4); //part
4261 }else{
4262 level_code= 30 + get_bits(gb, prefix-3); //part
4263 if(prefix>=16)
4264 level_code += (1<<(prefix-3))-4096;
4267 if(trailing_ones < 3) level_code += 2;
4269 suffix_length = 2;
4270 mask= -(level_code&1);
4271 level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4272 }else{
4273 if(trailing_ones < 3) level_code += (level_code>>31)|1;
4275 suffix_length = 1;
4276 if(level_code + 3U > 6U)
4277 suffix_length++;
4278 level[trailing_ones]= level_code;
4281 //remaining coefficients have suffix_length > 0
4282 for(i=trailing_ones+1;i<total_coeff;i++) {
4283 static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4284 int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4285 level_code= cavlc_level_tab[suffix_length][bitsi][0];
4287 skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4288 if(level_code >= 100){
4289 prefix= level_code - 100;
4290 if(prefix == LEVEL_TAB_BITS){
4291 prefix += get_level_prefix(gb);
4293 if(prefix<15){
4294 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4295 }else{
4296 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4297 if(prefix>=16)
4298 level_code += (1<<(prefix-3))-4096;
4300 mask= -(level_code&1);
4301 level_code= (((2+level_code)>>1) ^ mask) - mask;
4303 level[i]= level_code;
4305 if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4306 suffix_length++;
4310 if(total_coeff == max_coeff)
4311 zeros_left=0;
4312 else{
4313 if(n == CHROMA_DC_BLOCK_INDEX)
4314 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4315 else
4316 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
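/* Coefficients are written back to front: start at scan position
 * total_coeff-1 + zeros_left and subtract 1 + run_before after each level,
 * so the zero runs are distributed between the nonzero coefficients. */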
4319 coeff_num = zeros_left + total_coeff - 1;
4320 j = scantable[coeff_num];
4321 if(n > 24){
4322 block[j] = level[0];
4323 for(i=1;i<total_coeff;i++) {
4324 if(zeros_left <= 0)
4325 run_before = 0;
4326 else if(zeros_left < 7){
4327 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4328 }else{
4329 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4331 zeros_left -= run_before;
4332 coeff_num -= 1 + run_before;
4333 j= scantable[ coeff_num ];
4335 block[j]= level[i];
4337 }else{
4338 block[j] = (level[0] * qmul[j] + 32)>>6;
4339 for(i=1;i<total_coeff;i++) {
4340 if(zeros_left <= 0)
4341 run_before = 0;
4342 else if(zeros_left < 7){
4343 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4344 }else{
4345 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4347 zeros_left -= run_before;
4348 coeff_num -= 1 + run_before;
4349 j= scantable[ coeff_num ];
4351 block[j]= (level[i] * qmul[j] + 32)>>6;
4355 if(zeros_left<0){
4356 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4357 return -1;
4360 return 0;
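/**
 * Used when mb_field_decoding_flag is not present in the bitstream (e.g. both
 * macroblocks of an MBAFF pair are skipped): predict it from the left
 * neighbour (preferred) or the top neighbour in the same slice, defaulting to
 * frame (non-interlaced) decoding.
 */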
4363 static void predict_field_decoding_flag(H264Context *h){
4364 MpegEncContext * const s = &h->s;
4365 const int mb_xy= h->mb_xy;
4366 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4367 ? s->current_picture.mb_type[mb_xy-1]
4368 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4369 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4370 : 0;
4371 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4375 * decodes a P_SKIP or B_SKIP macroblock
4377 static void decode_mb_skip(H264Context *h){
4378 MpegEncContext * const s = &h->s;
4379 const int mb_xy= h->mb_xy;
4380 int mb_type=0;
4382 memset(h->non_zero_count[mb_xy], 0, 16);
4383 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove (yuck)
4385 if(MB_FIELD)
4386 mb_type|= MB_TYPE_INTERLACED;
4388 if( h->slice_type_nos == FF_B_TYPE )
4390 // just for fill_caches. pred_direct_motion will set the real mb_type
4391 mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4393 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4394 pred_direct_motion(h, &mb_type);
4395 mb_type|= MB_TYPE_SKIP;
4397 else
4399 int mx, my;
4400 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4402 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4403 pred_pskip_motion(h, &mx, &my);
4404 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4405 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4408 write_back_motion(h, mb_type);
4409 s->current_picture.mb_type[mb_xy]= mb_type;
4410 s->current_picture.qscale_table[mb_xy]= s->qscale;
4411 h->slice_table[ mb_xy ]= h->slice_num;
4412 h->prev_mb_skipped= 1;
4416 * decodes a macroblock
4417 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4419 static int decode_mb_cavlc(H264Context *h){
4420 MpegEncContext * const s = &h->s;
4421 int mb_xy;
4422 int partition_count;
4423 unsigned int mb_type, cbp;
4424 int dct8x8_allowed= h->pps.transform_8x8_mode;
4426 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4428 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4429 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4430 down the code */
4431 if(h->slice_type_nos != FF_I_TYPE){
4432 if(s->mb_skip_run==-1)
4433 s->mb_skip_run= get_ue_golomb(&s->gb);
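/* mb_skip_run gives the number of consecutive skipped macroblocks; -1 means it
 * has not yet been read for the current run. It is decremented once per
 * macroblock and read again once exhausted. */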
4435 if (s->mb_skip_run--) {
4436 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4437 if(s->mb_skip_run==0)
4438 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4439 else
4440 predict_field_decoding_flag(h);
4442 decode_mb_skip(h);
4443 return 0;
4446 if(FRAME_MBAFF){
4447 if( (s->mb_y&1) == 0 )
4448 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4451 h->prev_mb_skipped= 0;
4453 mb_type= get_ue_golomb(&s->gb);
4454 if(h->slice_type_nos == FF_B_TYPE){
4455 if(mb_type < 23){
4456 partition_count= b_mb_type_info[mb_type].partition_count;
4457 mb_type= b_mb_type_info[mb_type].type;
4458 }else{
4459 mb_type -= 23;
4460 goto decode_intra_mb;
4462 }else if(h->slice_type_nos == FF_P_TYPE){
4463 if(mb_type < 5){
4464 partition_count= p_mb_type_info[mb_type].partition_count;
4465 mb_type= p_mb_type_info[mb_type].type;
4466 }else{
4467 mb_type -= 5;
4468 goto decode_intra_mb;
4470 }else{
4471 assert(h->slice_type_nos == FF_I_TYPE);
4472 if(h->slice_type == FF_SI_TYPE && mb_type)
4473 mb_type--;
4474 decode_intra_mb:
4475 if(mb_type > 25){
4476 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4477 return -1;
4479 partition_count=0;
4480 cbp= i_mb_type_info[mb_type].cbp;
4481 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4482 mb_type= i_mb_type_info[mb_type].type;
4485 if(MB_FIELD)
4486 mb_type |= MB_TYPE_INTERLACED;
4488 h->slice_table[ mb_xy ]= h->slice_num;
4490 if(IS_INTRA_PCM(mb_type)){
4491 unsigned int x;
4493 // We assume these blocks are very rare, so we do not optimize them.
4494 align_get_bits(&s->gb);
4496 // The pixels are stored in the same order as levels in h->mb array.
4497 for(x=0; x < (CHROMA ? 384 : 256); x++){
4498 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4501 // In deblocking, the quantizer is 0
4502 s->current_picture.qscale_table[mb_xy]= 0;
4503 // All coeffs are present
4504 memset(h->non_zero_count[mb_xy], 16, 16);
4506 s->current_picture.mb_type[mb_xy]= mb_type;
4507 return 0;
4510 if(MB_MBAFF){
4511 h->ref_count[0] <<= 1;
4512 h->ref_count[1] <<= 1;
4515 fill_caches(h, mb_type, 0);
4517 //mb_pred
4518 if(IS_INTRA(mb_type)){
4519 int pred_mode;
4520 // init_top_left_availability(h);
4521 if(IS_INTRA4x4(mb_type)){
4522 int i;
4523 int di = 1;
4524 if(dct8x8_allowed && get_bits1(&s->gb)){
4525 mb_type |= MB_TYPE_8x8DCT;
4526 di = 4;
4529 // fill_intra4x4_pred_table(h);
4530 for(i=0; i<16; i+=di){
4531 int mode= pred_intra_mode(h, i);
4533 if(!get_bits1(&s->gb)){
4534 const int rem_mode= get_bits(&s->gb, 3);
4535 mode = rem_mode + (rem_mode >= mode);
4538 if(di==4)
4539 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4540 else
4541 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4543 write_back_intra_pred_mode(h);
4544 if( check_intra4x4_pred_mode(h) < 0)
4545 return -1;
4546 }else{
4547 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4548 if(h->intra16x16_pred_mode < 0)
4549 return -1;
4551 if(CHROMA){
4552 pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4553 if(pred_mode < 0)
4554 return -1;
4555 h->chroma_pred_mode= pred_mode;
4557 }else if(partition_count==4){
4558 int i, j, sub_partition_count[4], list, ref[2][4];
4560 if(h->slice_type_nos == FF_B_TYPE){
4561 for(i=0; i<4; i++){
4562 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4563 if(h->sub_mb_type[i] >=13){
4564 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4565 return -1;
4567 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4568 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4570 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4571 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4572 pred_direct_motion(h, &mb_type);
4573 h->ref_cache[0][scan8[4]] =
4574 h->ref_cache[1][scan8[4]] =
4575 h->ref_cache[0][scan8[12]] =
4576 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4578 }else{
4579 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4580 for(i=0; i<4; i++){
4581 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4582 if(h->sub_mb_type[i] >=4){
4583 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4584 return -1;
4586 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4587 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4591 for(list=0; list<h->list_count; list++){
4592 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4593 for(i=0; i<4; i++){
4594 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4595 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4596 unsigned int tmp;
4597 if(ref_count == 1){
4598 tmp= 0;
4599 }else if(ref_count == 2){
4600 tmp= get_bits1(&s->gb)^1;
4601 }else{
4602 tmp= get_ue_golomb_31(&s->gb);
4603 if(tmp>=ref_count){
4604 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4605 return -1;
4608 ref[list][i]= tmp;
4609 }else{
4610 //FIXME
4611 ref[list][i] = -1;
4616 if(dct8x8_allowed)
4617 dct8x8_allowed = get_dct8x8_allowed(h);
4619 for(list=0; list<h->list_count; list++){
4620 for(i=0; i<4; i++){
4621 if(IS_DIRECT(h->sub_mb_type[i])) {
4622 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4623 continue;
4625 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4626 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4628 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4629 const int sub_mb_type= h->sub_mb_type[i];
4630 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4631 for(j=0; j<sub_partition_count[i]; j++){
4632 int mx, my;
4633 const int index= 4*i + block_width*j;
4634 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4635 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4636 mx += get_se_golomb(&s->gb);
4637 my += get_se_golomb(&s->gb);
4638 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4640 if(IS_SUB_8X8(sub_mb_type)){
4641 mv_cache[ 1 ][0]=
4642 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4643 mv_cache[ 1 ][1]=
4644 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4645 }else if(IS_SUB_8X4(sub_mb_type)){
4646 mv_cache[ 1 ][0]= mx;
4647 mv_cache[ 1 ][1]= my;
4648 }else if(IS_SUB_4X8(sub_mb_type)){
4649 mv_cache[ 8 ][0]= mx;
4650 mv_cache[ 8 ][1]= my;
4652 mv_cache[ 0 ][0]= mx;
4653 mv_cache[ 0 ][1]= my;
4655 }else{
4656 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4657 p[0] = p[1]=
4658 p[8] = p[9]= 0;
4662 }else if(IS_DIRECT(mb_type)){
4663 pred_direct_motion(h, &mb_type);
4664 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4665 }else{
4666 int list, mx, my, i;
4667 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4668 if(IS_16X16(mb_type)){
4669 for(list=0; list<h->list_count; list++){
4670 unsigned int val;
4671 if(IS_DIR(mb_type, 0, list)){
4672 if(h->ref_count[list]==1){
4673 val= 0;
4674 }else if(h->ref_count[list]==2){
4675 val= get_bits1(&s->gb)^1;
4676 }else{
4677 val= get_ue_golomb_31(&s->gb);
4678 if(val >= h->ref_count[list]){
4679 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4680 return -1;
4683 }else
4684 val= LIST_NOT_USED&0xFF;
4685 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4687 for(list=0; list<h->list_count; list++){
4688 unsigned int val;
4689 if(IS_DIR(mb_type, 0, list)){
4690 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4691 mx += get_se_golomb(&s->gb);
4692 my += get_se_golomb(&s->gb);
4693 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4695 val= pack16to32(mx,my);
4696 }else
4697 val=0;
4698 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4701 else if(IS_16X8(mb_type)){
4702 for(list=0; list<h->list_count; list++){
4703 for(i=0; i<2; i++){
4704 unsigned int val;
4705 if(IS_DIR(mb_type, i, list)){
4706 if(h->ref_count[list] == 1){
4707 val= 0;
4708 }else if(h->ref_count[list] == 2){
4709 val= get_bits1(&s->gb)^1;
4710 }else{
4711 val= get_ue_golomb_31(&s->gb);
4712 if(val >= h->ref_count[list]){
4713 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4714 return -1;
4717 }else
4718 val= LIST_NOT_USED&0xFF;
4719 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4722 for(list=0; list<h->list_count; list++){
4723 for(i=0; i<2; i++){
4724 unsigned int val;
4725 if(IS_DIR(mb_type, i, list)){
4726 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4727 mx += get_se_golomb(&s->gb);
4728 my += get_se_golomb(&s->gb);
4729 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4731 val= pack16to32(mx,my);
4732 }else
4733 val=0;
4734 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4737 }else{
4738 assert(IS_8X16(mb_type));
4739 for(list=0; list<h->list_count; list++){
4740 for(i=0; i<2; i++){
4741 unsigned int val;
4742 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4743 if(h->ref_count[list]==1){
4744 val= 0;
4745 }else if(h->ref_count[list]==2){
4746 val= get_bits1(&s->gb)^1;
4747 }else{
4748 val= get_ue_golomb_31(&s->gb);
4749 if(val >= h->ref_count[list]){
4750 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4751 return -1;
4754 }else
4755 val= LIST_NOT_USED&0xFF;
4756 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4759 for(list=0; list<h->list_count; list++){
4760 for(i=0; i<2; i++){
4761 unsigned int val;
4762 if(IS_DIR(mb_type, i, list)){
4763 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4764 mx += get_se_golomb(&s->gb);
4765 my += get_se_golomb(&s->gb);
4766 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4768 val= pack16to32(mx,my);
4769 }else
4770 val=0;
4771 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4777 if(IS_INTER(mb_type))
4778 write_back_motion(h, mb_type);
4780 if(!IS_INTRA16x16(mb_type)){
4781 cbp= get_ue_golomb(&s->gb);
4782 if(cbp > 47){
4783 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4784 return -1;
4787 if(CHROMA){
4788 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4789 else cbp= golomb_to_inter_cbp [cbp];
4790 }else{
4791 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4792 else cbp= golomb_to_inter_cbp_gray[cbp];
4795 h->cbp = cbp;
4797 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4798 if(get_bits1(&s->gb)){
4799 mb_type |= MB_TYPE_8x8DCT;
4800 h->cbp_table[mb_xy]= cbp;
4803 s->current_picture.mb_type[mb_xy]= mb_type;
4805 if(cbp || IS_INTRA16x16(mb_type)){
4806 int i8x8, i4x4, chroma_idx;
4807 int dquant;
4808 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4809 const uint8_t *scan, *scan8x8, *dc_scan;
4811 // fill_non_zero_count_cache(h);
4813 if(IS_INTERLACED(mb_type)){
4814 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4815 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4816 dc_scan= luma_dc_field_scan;
4817 }else{
4818 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4819 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4820 dc_scan= luma_dc_zigzag_scan;
4823 dquant= get_se_golomb(&s->gb);
4825 if( dquant > 25 || dquant < -26 ){
4826 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4827 return -1;
4830 s->qscale += dquant;
4831 if(((unsigned)s->qscale) > 51){
4832 if(s->qscale<0) s->qscale+= 52;
4833 else s->qscale-= 52;
4836 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4837 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4838 if(IS_INTRA16x16(mb_type)){
4839 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4840 return -1; //FIXME continue if partitioned and other return -1 too
4843 assert((cbp&15) == 0 || (cbp&15) == 15);
4845 if(cbp&15){
4846 for(i8x8=0; i8x8<4; i8x8++){
4847 for(i4x4=0; i4x4<4; i4x4++){
4848 const int index= i4x4 + 4*i8x8;
4849 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4850 return -1;
4854 }else{
4855 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4857 }else{
4858 for(i8x8=0; i8x8<4; i8x8++){
4859 if(cbp & (1<<i8x8)){
4860 if(IS_8x8DCT(mb_type)){
4861 DCTELEM *buf = &h->mb[64*i8x8];
4862 uint8_t *nnz;
4863 for(i4x4=0; i4x4<4; i4x4++){
4864 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4865 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4866 return -1;
4868 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4869 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4870 }else{
4871 for(i4x4=0; i4x4<4; i4x4++){
4872 const int index= i4x4 + 4*i8x8;
4874 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4875 return -1;
4879 }else{
4880 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4881 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4886 if(cbp&0x30){
4887 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4888 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4889 return -1;
4893 if(cbp&0x20){
4894 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4895 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4896 for(i4x4=0; i4x4<4; i4x4++){
4897 const int index= 16 + 4*chroma_idx + i4x4;
4898 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4899 return -1;
4903 }else{
4904 uint8_t * const nnz= &h->non_zero_count_cache[0];
4905 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4906 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4908 }else{
4909 uint8_t * const nnz= &h->non_zero_count_cache[0];
4910 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4911 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4912 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4914 s->current_picture.qscale_table[mb_xy]= s->qscale;
4915 write_back_non_zero_count(h);
4917 if(MB_MBAFF){
4918 h->ref_count[0] >>= 1;
4919 h->ref_count[1] >>= 1;
4922 return 0;
4925 static int decode_cabac_field_decoding_flag(H264Context *h) {
4926 MpegEncContext * const s = &h->s;
4927 const int mb_x = s->mb_x;
4928 const int mb_y = s->mb_y & ~1;
4929 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4930 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4932 unsigned int ctx = 0;
4934 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4935 ctx += 1;
4937 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4938 ctx += 1;
4941 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4944 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4945 uint8_t *state= &h->cabac_state[ctx_base];
4946 int mb_type;
4948 if(intra_slice){
4949 MpegEncContext * const s = &h->s;
4950 const int mba_xy = h->left_mb_xy[0];
4951 const int mbb_xy = h->top_mb_xy;
4952 int ctx=0;
4953 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4954 ctx++;
4955 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4956 ctx++;
4957 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4958 return 0; /* I4x4 */
4959 state += 2;
4960 }else{
4961 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4962 return 0; /* I4x4 */
4965 if( get_cabac_terminate( &h->cabac ) )
4966 return 25; /* PCM */
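/* I_16x16 macroblock types are laid out as
 * mb_type = 1 + pred_mode + 4*cbp_chroma + 12*(cbp_luma != 0),
 * which is what the additions below reconstruct step by step. */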
4968 mb_type = 1; /* I16x16 */
4969 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4970 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4971 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4972 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4973 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4974 return mb_type;
4977 static int decode_cabac_mb_type_b( H264Context *h ) {
4978 MpegEncContext * const s = &h->s;
4980 const int mba_xy = h->left_mb_xy[0];
4981 const int mbb_xy = h->top_mb_xy;
4982 int ctx = 0;
4983 int bits;
4984 assert(h->slice_type_nos == FF_B_TYPE);
4986 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4987 ctx++;
4988 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4989 ctx++;
4991 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4992 return 0; /* B_Direct_16x16 */
4994 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4995 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4998 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4999 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5000 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5001 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5002 if( bits < 8 )
5003 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5004 else if( bits == 13 ) {
5005 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5006 } else if( bits == 14 )
5007 return 11; /* B_L1_L0_8x16 */
5008 else if( bits == 15 )
5009 return 22; /* B_8x8 */
5011 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5012 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5015 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5016 MpegEncContext * const s = &h->s;
5017 int mba_xy, mbb_xy;
5018 int ctx = 0;
5020 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5021 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5022 mba_xy = mb_xy - 1;
5023 if( (mb_y&1)
5024 && h->slice_table[mba_xy] == h->slice_num
5025 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5026 mba_xy += s->mb_stride;
5027 if( MB_FIELD ){
5028 mbb_xy = mb_xy - s->mb_stride;
5029 if( !(mb_y&1)
5030 && h->slice_table[mbb_xy] == h->slice_num
5031 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5032 mbb_xy -= s->mb_stride;
5033 }else
5034 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5035 }else{
5036 int mb_xy = h->mb_xy;
5037 mba_xy = mb_xy - 1;
5038 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5041 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5042 ctx++;
5043 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5044 ctx++;
5046 if( h->slice_type_nos == FF_B_TYPE )
5047 ctx += 13;
5048 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5051 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5052 int mode = 0;
5054 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5055 return pred_mode;
5057 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5058 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5059 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5061 if( mode >= pred_mode )
5062 return mode + 1;
5063 else
5064 return mode;
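/**
 * Decode intra_chroma_pred_mode as a truncated unary code (max 3).
 * The context of the first bin depends on whether the neighbouring
 * macroblocks use a non-zero chroma prediction mode.
 */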
5067 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5068 const int mba_xy = h->left_mb_xy[0];
5069 const int mbb_xy = h->top_mb_xy;
5071 int ctx = 0;
5073 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5074 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5075 ctx++;
5077 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5078 ctx++;
5080 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5081 return 0;
5083 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5084 return 1;
5085 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5086 return 2;
5087 else
5088 return 3;
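/**
 * Decode the 4-bit luma coded_block_pattern, one bin per 8x8 block.
 * Each bin's context is derived from the corresponding bits of the
 * left, top and already decoded parts of the CBP.
 */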
5091 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5092 int cbp_b, cbp_a, ctx, cbp = 0;
5094 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5095 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
5097 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5098 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5099 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
5100 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5101 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
5102 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5103 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
5104 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5105 return cbp;
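/**
 * Decode the chroma coded_block_pattern:
 * 0 = no chroma coefficients, 1 = DC only, 2 = DC and AC.
 */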
5107 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5108 int ctx;
5109 int cbp_a, cbp_b;
5111 cbp_a = (h->left_cbp>>4)&0x03;
5112 cbp_b = (h-> top_cbp>>4)&0x03;
5114 ctx = 0;
5115 if( cbp_a > 0 ) ctx++;
5116 if( cbp_b > 0 ) ctx += 2;
5117 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5118 return 0;
5120 ctx = 4;
5121 if( cbp_a == 2 ) ctx++;
5122 if( cbp_b == 2 ) ctx += 2;
5123 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
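/**
 * Decode mb_qp_delta as a unary code and map it to a signed value.
 * Returns INT_MIN if the code is implausibly long, to avoid an
 * infinite loop on damaged bitstreams.
 */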
5125 static int decode_cabac_mb_dqp( H264Context *h) {
5126 int ctx= h->last_qscale_diff != 0;
5127 int val = 0;
5129 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5130 ctx= 2+(ctx>>1);
5131 val++;
5132 if(val > 102) //prevent infinite loop
5133 return INT_MIN;
5136 if( val&0x01 )
5137 return (val + 1)>>1 ;
5138 else
5139 return -((val + 1)>>1);
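/**
 * Decode the sub_mb_type of one 8x8 partition in a P macroblock
 * (0 = 8x8, 1 = 8x4, 2 = 4x8, 3 = 4x4).
 */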
5141 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5142 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5143 return 0; /* 8x8 */
5144 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5145 return 1; /* 8x4 */
5146 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5147 return 2; /* 4x8 */
5148 return 3; /* 4x4 */
5150 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5151 int type;
5152 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5153 return 0; /* B_Direct_8x8 */
5154 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5155 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5156 type = 3;
5157 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5158 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5159 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5160 type += 4;
5162 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5163 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5164 return type;
5167 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5168 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
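/**
 * Decode the reference index of block n in the given list as a unary
 * code; the context of the first bin depends on the neighbouring
 * reference indices (and their direct flags in B slices).
 * Returns -1 if the code exceeds the sanity limit.
 */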
5171 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5172 int refa = h->ref_cache[list][scan8[n] - 1];
5173 int refb = h->ref_cache[list][scan8[n] - 8];
5174 int ref = 0;
5175 int ctx = 0;
5177 if( h->slice_type_nos == FF_B_TYPE) {
5178 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5179 ctx++;
5180 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5181 ctx += 2;
5182 } else {
5183 if( refa > 0 )
5184 ctx++;
5185 if( refb > 0 )
5186 ctx += 2;
5189 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5190 ref++;
5191 ctx = (ctx>>2)+4;
5192 if(ref >= 32 /*h->ref_list[list]*/){
5193 return -1;
5196 return ref;
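/**
 * Decode one motion vector difference component (l = 0: x, l = 1: y)
 * for block n: a truncated unary prefix of up to 9 bins followed by
 * an exp-Golomb style bypass suffix and a bypass-coded sign (UEG3
 * binarization). Returns INT_MIN on overflow.
 */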
5199 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5200 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5201 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5202 int ctxbase = (l == 0) ? 40 : 47;
5203 int mvd;
5204 int ctx = (amvd>2) + (amvd>32);
5206 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5207 return 0;
5209 mvd= 1;
5210 ctx= 3;
5211 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5212 mvd++;
5213 if( ctx < 6 )
5214 ctx++;
5217 if( mvd >= 9 ) {
5218 int k = 3;
5219 while( get_cabac_bypass( &h->cabac ) ) {
5220 mvd += 1 << k;
5221 k++;
5222 if(k>24){
5223 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5224 return INT_MIN;
5227 while( k-- ) {
5228 if( get_cabac_bypass( &h->cabac ) )
5229 mvd += 1 << k;
5232 return get_cabac_bypass_sign( &h->cabac, -mvd );
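/**
 * Compute the context index for the coded_block_flag of block
 * (cat, idx) from the non-zero status of the left and top neighbours;
 * the DC categories read the packed flags stored in left_cbp/top_cbp.
 */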
5235 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5236 int nza, nzb;
5237 int ctx = 0;
5239 if( is_dc ) {
5240 if( cat == 0 ) {
5241 nza = h->left_cbp&0x100;
5242 nzb = h-> top_cbp&0x100;
5243 } else {
5244 nza = (h->left_cbp>>(6+idx))&0x01;
5245 nzb = (h-> top_cbp>>(6+idx))&0x01;
5247 } else {
5248 assert(cat == 1 || cat == 2 || cat == 4);
5249 nza = h->non_zero_count_cache[scan8[idx] - 1];
5250 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5253 if( nza > 0 )
5254 ctx++;
5256 if( nzb > 0 )
5257 ctx += 2;
5259 return ctx + 4 * cat;
5262 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5263 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5264 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5265 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5266 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
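/**
 * Decode one CABAC-coded residual block: the coded_block_flag, the
 * significance map and the coefficient levels and signs (with a
 * bypass-coded escape for absolute levels of 15 and above).
 * The meaning of cat/n is documented in the comment below.
 */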
5269 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5270 static const int significant_coeff_flag_offset[2][6] = {
5271 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5272 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5274 static const int last_coeff_flag_offset[2][6] = {
5275 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5276 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5278 static const int coeff_abs_level_m1_offset[6] = {
5279 227+0, 227+10, 227+20, 227+30, 227+39, 426
5281 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5282 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5283 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5284 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5285 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5286 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5287 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5288 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5289 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5291 /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5292 * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5293 * map node ctx => cabac ctx for level=1 */
5294 static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5295 /* map node ctx => cabac ctx for level>1 */
5296 static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5297 static const uint8_t coeff_abs_level_transition[2][8] = {
5298 /* update node ctx after decoding a level=1 */
5299 { 1, 2, 3, 3, 4, 5, 6, 7 },
5300 /* update node ctx after decoding a level>1 */
5301 { 4, 4, 4, 4, 5, 6, 7, 7 }
5304 int index[64];
5306 int av_unused last;
5307 int coeff_count = 0;
5308 int node_ctx = 0;
5310 uint8_t *significant_coeff_ctx_base;
5311 uint8_t *last_coeff_ctx_base;
5312 uint8_t *abs_level_m1_ctx_base;
5314 #if !ARCH_X86
5315 #define CABAC_ON_STACK
5316 #endif
5317 #ifdef CABAC_ON_STACK
5318 #define CC &cc
5319 CABACContext cc;
5320 cc.range = h->cabac.range;
5321 cc.low = h->cabac.low;
5322 cc.bytestream= h->cabac.bytestream;
5323 #else
5324 #define CC &h->cabac
5325 #endif
5328 /* cat: 0-> DC 16x16 n = 0
5329 * 1-> AC 16x16 n = luma4x4idx
5330 * 2-> Luma4x4 n = luma4x4idx
5331 * 3-> DC Chroma n = iCbCr
5332 * 4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5333 * 5-> Luma8x8 n = 4 * luma8x8idx
5336 /* read coded block flag */
5337 if( is_dc || cat != 5 ) {
5338 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5339 if( !is_dc )
5340 h->non_zero_count_cache[scan8[n]] = 0;
5342 #ifdef CABAC_ON_STACK
5343 h->cabac.range = cc.range ;
5344 h->cabac.low = cc.low ;
5345 h->cabac.bytestream= cc.bytestream;
5346 #endif
5347 return;
5351 significant_coeff_ctx_base = h->cabac_state
5352 + significant_coeff_flag_offset[MB_FIELD][cat];
5353 last_coeff_ctx_base = h->cabac_state
5354 + last_coeff_flag_offset[MB_FIELD][cat];
5355 abs_level_m1_ctx_base = h->cabac_state
5356 + coeff_abs_level_m1_offset[cat];
5358 if( !is_dc && cat == 5 ) {
5359 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5360 for(last= 0; last < coefs; last++) { \
5361 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5362 if( get_cabac( CC, sig_ctx )) { \
5363 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5364 index[coeff_count++] = last; \
5365 if( get_cabac( CC, last_ctx ) ) { \
5366 last= max_coeff; \
5367 break; \
5371 if( last == max_coeff -1 ) {\
5372 index[coeff_count++] = last;\
5374 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5375 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5376 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5377 } else {
5378 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5379 #else
5380 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5381 } else {
5382 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5383 #endif
5385 assert(coeff_count > 0);
5387 if( is_dc ) {
5388 if( cat == 0 )
5389 h->cbp_table[h->mb_xy] |= 0x100;
5390 else
5391 h->cbp_table[h->mb_xy] |= 0x40 << n;
5392 } else {
5393 if( cat == 5 )
5394 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5395 else {
5396 assert( cat == 1 || cat == 2 || cat == 4 );
5397 h->non_zero_count_cache[scan8[n]] = coeff_count;
5401 do {
5402 uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5404 int j= scantable[index[--coeff_count]];
5406 if( get_cabac( CC, ctx ) == 0 ) {
5407 node_ctx = coeff_abs_level_transition[0][node_ctx];
5408 if( is_dc ) {
5409 block[j] = get_cabac_bypass_sign( CC, -1);
5410 }else{
5411 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5413 } else {
5414 int coeff_abs = 2;
5415 ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5416 node_ctx = coeff_abs_level_transition[1][node_ctx];
5418 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5419 coeff_abs++;
5422 if( coeff_abs >= 15 ) {
5423 int j = 0;
5424 while( get_cabac_bypass( CC ) ) {
5425 j++;
5428 coeff_abs=1;
5429 while( j-- ) {
5430 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5432 coeff_abs+= 14;
5435 if( is_dc ) {
5436 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5437 }else{
5438 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5441 } while( coeff_count );
5442 #ifdef CABAC_ON_STACK
5443 h->cabac.range = cc.range ;
5444 h->cabac.low = cc.low ;
5445 h->cabac.bytestream= cc.bytestream;
5446 #endif
5450 #if !CONFIG_SMALL
5451 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5452 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5455 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5456 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5458 #endif
5460 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5461 #if CONFIG_SMALL
5462 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5463 #else
5464 if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5465 else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5466 #endif
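/**
 * Compute h->top_mb_xy and h->left_mb_xy[0] for the current
 * macroblock, adjusting them for MBAFF pairs and field pictures.
 */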
5469 static inline void compute_mb_neighbors(H264Context *h)
5471 MpegEncContext * const s = &h->s;
5472 const int mb_xy = h->mb_xy;
5473 h->top_mb_xy = mb_xy - s->mb_stride;
5474 h->left_mb_xy[0] = mb_xy - 1;
5475 if(FRAME_MBAFF){
5476 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5477 const int top_pair_xy = pair_xy - s->mb_stride;
5478 const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5479 const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5480 const int curr_mb_field_flag = MB_FIELD;
5481 const int bottom = (s->mb_y & 1);
5483 if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5484 h->top_mb_xy -= s->mb_stride;
5486 if (!left_mb_field_flag == curr_mb_field_flag) {
5487 h->left_mb_xy[0] = pair_xy - 1;
5489 } else if (FIELD_PICTURE) {
5490 h->top_mb_xy -= s->mb_stride;
5492 return;
5496 * decodes a macroblock
5497 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is detected
5499 static int decode_mb_cabac(H264Context *h) {
5500 MpegEncContext * const s = &h->s;
5501 int mb_xy;
5502 int mb_type, partition_count, cbp = 0;
5503 int dct8x8_allowed= h->pps.transform_8x8_mode;
5505 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5507 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5508 if( h->slice_type_nos != FF_I_TYPE ) {
5509 int skip;
5510 /* a skipped mb needs the aff flag from the following mb */
5511 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5512 predict_field_decoding_flag(h);
5513 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5514 skip = h->next_mb_skipped;
5515 else
5516 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5517 /* read skip flags */
5518 if( skip ) {
5519 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5520 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5521 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5522 if(!h->next_mb_skipped)
5523 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5526 decode_mb_skip(h);
5528 h->cbp_table[mb_xy] = 0;
5529 h->chroma_pred_mode_table[mb_xy] = 0;
5530 h->last_qscale_diff = 0;
5532 return 0;
5536 if(FRAME_MBAFF){
5537 if( (s->mb_y&1) == 0 )
5538 h->mb_mbaff =
5539 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5542 h->prev_mb_skipped = 0;
5544 compute_mb_neighbors(h);
5546 if( h->slice_type_nos == FF_B_TYPE ) {
5547 mb_type = decode_cabac_mb_type_b( h );
5548 if( mb_type < 23 ){
5549 partition_count= b_mb_type_info[mb_type].partition_count;
5550 mb_type= b_mb_type_info[mb_type].type;
5551 }else{
5552 mb_type -= 23;
5553 goto decode_intra_mb;
5555 } else if( h->slice_type_nos == FF_P_TYPE ) {
5556 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5557 /* P-type */
5558 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5559 /* P_L0_D16x16, P_8x8 */
5560 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5561 } else {
5562 /* P_L0_D8x16, P_L0_D16x8 */
5563 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5565 partition_count= p_mb_type_info[mb_type].partition_count;
5566 mb_type= p_mb_type_info[mb_type].type;
5567 } else {
5568 mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5569 goto decode_intra_mb;
5571 } else {
5572 mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5573 if(h->slice_type == FF_SI_TYPE && mb_type)
5574 mb_type--;
5575 assert(h->slice_type_nos == FF_I_TYPE);
5576 decode_intra_mb:
5577 partition_count = 0;
5578 cbp= i_mb_type_info[mb_type].cbp;
5579 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5580 mb_type= i_mb_type_info[mb_type].type;
5582 if(MB_FIELD)
5583 mb_type |= MB_TYPE_INTERLACED;
5585 h->slice_table[ mb_xy ]= h->slice_num;
5587 if(IS_INTRA_PCM(mb_type)) {
5588 const uint8_t *ptr;
5590 // We assume these blocks are very rare, so we do not optimize them.
5591 // FIXME The following two lines get the bitstream position in the CABAC
5592 // decoder; this should be done by a function in cabac.h (or cabac.c).
5593 ptr= h->cabac.bytestream;
5594 if(h->cabac.low&0x1) ptr--;
5595 if(CABAC_BITS==16){
5596 if(h->cabac.low&0x1FF) ptr--;
5599 // The pixels are stored in the same order as levels in h->mb array.
5600 memcpy(h->mb, ptr, 256); ptr+=256;
5601 if(CHROMA){
5602 memcpy(h->mb+128, ptr, 128); ptr+=128;
5605 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5607 // All blocks are present
5608 h->cbp_table[mb_xy] = 0x1ef;
5609 h->chroma_pred_mode_table[mb_xy] = 0;
5610 // In deblocking, the quantizer is 0
5611 s->current_picture.qscale_table[mb_xy]= 0;
5612 // All coeffs are present
5613 memset(h->non_zero_count[mb_xy], 16, 16);
5614 s->current_picture.mb_type[mb_xy]= mb_type;
5615 h->last_qscale_diff = 0;
5616 return 0;
5619 if(MB_MBAFF){
5620 h->ref_count[0] <<= 1;
5621 h->ref_count[1] <<= 1;
5624 fill_caches(h, mb_type, 0);
5626 if( IS_INTRA( mb_type ) ) {
5627 int i, pred_mode;
5628 if( IS_INTRA4x4( mb_type ) ) {
5629 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5630 mb_type |= MB_TYPE_8x8DCT;
5631 for( i = 0; i < 16; i+=4 ) {
5632 int pred = pred_intra_mode( h, i );
5633 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5634 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5636 } else {
5637 for( i = 0; i < 16; i++ ) {
5638 int pred = pred_intra_mode( h, i );
5639 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5641 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5644 write_back_intra_pred_mode(h);
5645 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5646 } else {
5647 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5648 if( h->intra16x16_pred_mode < 0 ) return -1;
5650 if(CHROMA){
5651 h->chroma_pred_mode_table[mb_xy] =
5652 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5654 pred_mode= check_intra_pred_mode( h, pred_mode );
5655 if( pred_mode < 0 ) return -1;
5656 h->chroma_pred_mode= pred_mode;
5658 } else if( partition_count == 4 ) {
5659 int i, j, sub_partition_count[4], list, ref[2][4];
5661 if( h->slice_type_nos == FF_B_TYPE ) {
5662 for( i = 0; i < 4; i++ ) {
5663 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5664 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5665 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5667 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5668 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5669 pred_direct_motion(h, &mb_type);
5670 h->ref_cache[0][scan8[4]] =
5671 h->ref_cache[1][scan8[4]] =
5672 h->ref_cache[0][scan8[12]] =
5673 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5674 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5675 for( i = 0; i < 4; i++ )
5676 if( IS_DIRECT(h->sub_mb_type[i]) )
5677 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5680 } else {
5681 for( i = 0; i < 4; i++ ) {
5682 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5683 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5684 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5688 for( list = 0; list < h->list_count; list++ ) {
5689 for( i = 0; i < 4; i++ ) {
5690 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5691 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5692 if( h->ref_count[list] > 1 ){
5693 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5694 if(ref[list][i] >= (unsigned)h->ref_count[list]){
5695 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5696 return -1;
5698 }else
5699 ref[list][i] = 0;
5700 } else {
5701 ref[list][i] = -1;
5703 h->ref_cache[list][ scan8[4*i]+1 ]=
5704 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5708 if(dct8x8_allowed)
5709 dct8x8_allowed = get_dct8x8_allowed(h);
5711 for(list=0; list<h->list_count; list++){
5712 for(i=0; i<4; i++){
5713 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5714 if(IS_DIRECT(h->sub_mb_type[i])){
5715 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5716 continue;
5719 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5720 const int sub_mb_type= h->sub_mb_type[i];
5721 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5722 for(j=0; j<sub_partition_count[i]; j++){
5723 int mpx, mpy;
5724 int mx, my;
5725 const int index= 4*i + block_width*j;
5726 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5727 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5728 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5730 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5731 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5732 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5734 if(IS_SUB_8X8(sub_mb_type)){
5735 mv_cache[ 1 ][0]=
5736 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5737 mv_cache[ 1 ][1]=
5738 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5740 mvd_cache[ 1 ][0]=
5741 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5742 mvd_cache[ 1 ][1]=
5743 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5744 }else if(IS_SUB_8X4(sub_mb_type)){
5745 mv_cache[ 1 ][0]= mx;
5746 mv_cache[ 1 ][1]= my;
5748 mvd_cache[ 1 ][0]= mx - mpx;
5749 mvd_cache[ 1 ][1]= my - mpy;
5750 }else if(IS_SUB_4X8(sub_mb_type)){
5751 mv_cache[ 8 ][0]= mx;
5752 mv_cache[ 8 ][1]= my;
5754 mvd_cache[ 8 ][0]= mx - mpx;
5755 mvd_cache[ 8 ][1]= my - mpy;
5757 mv_cache[ 0 ][0]= mx;
5758 mv_cache[ 0 ][1]= my;
5760 mvd_cache[ 0 ][0]= mx - mpx;
5761 mvd_cache[ 0 ][1]= my - mpy;
5763 }else{
5764 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5765 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5766 p[0] = p[1] = p[8] = p[9] = 0;
5767 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5771 } else if( IS_DIRECT(mb_type) ) {
5772 pred_direct_motion(h, &mb_type);
5773 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5774 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5775 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5776 } else {
5777 int list, mx, my, i, mpx, mpy;
5778 if(IS_16X16(mb_type)){
5779 for(list=0; list<h->list_count; list++){
5780 if(IS_DIR(mb_type, 0, list)){
5781 int ref;
5782 if(h->ref_count[list] > 1){
5783 ref= decode_cabac_mb_ref(h, list, 0);
5784 if(ref >= (unsigned)h->ref_count[list]){
5785 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5786 return -1;
5788 }else
5789 ref=0;
5790 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5791 }else
5792 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize this and the other fill_rectangle calls below
5794 for(list=0; list<h->list_count; list++){
5795 if(IS_DIR(mb_type, 0, list)){
5796 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5798 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5799 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5800 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5802 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5803 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5804 }else
5805 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5808 else if(IS_16X8(mb_type)){
5809 for(list=0; list<h->list_count; list++){
5810 for(i=0; i<2; i++){
5811 if(IS_DIR(mb_type, i, list)){
5812 int ref;
5813 if(h->ref_count[list] > 1){
5814 ref= decode_cabac_mb_ref( h, list, 8*i );
5815 if(ref >= (unsigned)h->ref_count[list]){
5816 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5817 return -1;
5819 }else
5820 ref=0;
5821 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5822 }else
5823 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5826 for(list=0; list<h->list_count; list++){
5827 for(i=0; i<2; i++){
5828 if(IS_DIR(mb_type, i, list)){
5829 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5830 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5831 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5832 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5834 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5835 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5836 }else{
5837 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5838 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5842 }else{
5843 assert(IS_8X16(mb_type));
5844 for(list=0; list<h->list_count; list++){
5845 for(i=0; i<2; i++){
5846 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5847 int ref;
5848 if(h->ref_count[list] > 1){
5849 ref= decode_cabac_mb_ref( h, list, 4*i );
5850 if(ref >= (unsigned)h->ref_count[list]){
5851 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5852 return -1;
5854 }else
5855 ref=0;
5856 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5857 }else
5858 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5861 for(list=0; list<h->list_count; list++){
5862 for(i=0; i<2; i++){
5863 if(IS_DIR(mb_type, i, list)){
5864 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5865 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5866 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5868 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5869 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5870 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5871 }else{
5872 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5873 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5880 if( IS_INTER( mb_type ) ) {
5881 h->chroma_pred_mode_table[mb_xy] = 0;
5882 write_back_motion( h, mb_type );
5885 if( !IS_INTRA16x16( mb_type ) ) {
5886 cbp = decode_cabac_mb_cbp_luma( h );
5887 if(CHROMA)
5888 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5891 h->cbp_table[mb_xy] = h->cbp = cbp;
5893 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5894 if( decode_cabac_mb_transform_size( h ) )
5895 mb_type |= MB_TYPE_8x8DCT;
5897 s->current_picture.mb_type[mb_xy]= mb_type;
5899 if( cbp || IS_INTRA16x16( mb_type ) ) {
5900 const uint8_t *scan, *scan8x8, *dc_scan;
5901 const uint32_t *qmul;
5902 int dqp;
5904 if(IS_INTERLACED(mb_type)){
5905 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5906 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5907 dc_scan= luma_dc_field_scan;
5908 }else{
5909 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5910 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5911 dc_scan= luma_dc_zigzag_scan;
5914 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5915 if( dqp == INT_MIN ){
5916 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5917 return -1;
5919 s->qscale += dqp;
5920 if(((unsigned)s->qscale) > 51){
5921 if(s->qscale<0) s->qscale+= 52;
5922 else s->qscale-= 52;
5924 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5925 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5927 if( IS_INTRA16x16( mb_type ) ) {
5928 int i;
5929 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5930 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5932 if( cbp&15 ) {
5933 qmul = h->dequant4_coeff[0][s->qscale];
5934 for( i = 0; i < 16; i++ ) {
5935 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5936 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5938 } else {
5939 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5941 } else {
5942 int i8x8, i4x4;
5943 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5944 if( cbp & (1<<i8x8) ) {
5945 if( IS_8x8DCT(mb_type) ) {
5946 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5947 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5948 } else {
5949 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5950 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5951 const int index = 4*i8x8 + i4x4;
5952 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5953 //START_TIMER
5954 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5955 //STOP_TIMER("decode_residual")
5958 } else {
5959 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5960 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5965 if( cbp&0x30 ){
5966 int c;
5967 for( c = 0; c < 2; c++ ) {
5968 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5969 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5973 if( cbp&0x20 ) {
5974 int c, i;
5975 for( c = 0; c < 2; c++ ) {
5976 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5977 for( i = 0; i < 4; i++ ) {
5978 const int index = 16 + 4 * c + i;
5979 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5980 decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5983 } else {
5984 uint8_t * const nnz= &h->non_zero_count_cache[0];
5985 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5986 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5988 } else {
5989 uint8_t * const nnz= &h->non_zero_count_cache[0];
5990 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5991 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5992 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5993 h->last_qscale_diff = 0;
5996 s->current_picture.qscale_table[mb_xy]= s->qscale;
5997 write_back_non_zero_count(h);
5999 if(MB_MBAFF){
6000 h->ref_count[0] >>= 1;
6001 h->ref_count[1] >>= 1;
6004 return 0;
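/**
 * The filter_mb_edge*() helpers below deblock a single macroblock
 * edge: for bS < 4 the normal filter with per-line tc0 clipping is
 * used, for bS == 4 the strong intra filter. The *v variants handle
 * vertical edges, the *h variants horizontal ones, and the *c*
 * variants the chroma planes.
 */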
6008 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6009 const int index_a = qp + h->slice_alpha_c0_offset;
6010 const int alpha = (alpha_table+52)[index_a];
6011 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6012 if (alpha ==0 || beta == 0) return;
6014 if( bS[0] < 4 ) {
6015 int8_t tc[4];
6016 tc[0] = (tc0_table+52)[index_a][bS[0]];
6017 tc[1] = (tc0_table+52)[index_a][bS[1]];
6018 tc[2] = (tc0_table+52)[index_a][bS[2]];
6019 tc[3] = (tc0_table+52)[index_a][bS[3]];
6020 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6021 } else {
6022 h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6025 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6026 const int index_a = qp + h->slice_alpha_c0_offset;
6027 const int alpha = (alpha_table+52)[index_a];
6028 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6029 if (alpha ==0 || beta == 0) return;
6031 if( bS[0] < 4 ) {
6032 int8_t tc[4];
6033 tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6034 tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6035 tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6036 tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6037 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6038 } else {
6039 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6043 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6044 int i;
6045 for( i = 0; i < 16; i++, pix += stride) {
6046 int index_a;
6047 int alpha;
6048 int beta;
6050 int qp_index;
6051 int bS_index = (i >> 1);
6052 if (!MB_FIELD) {
6053 bS_index &= ~1;
6054 bS_index |= (i & 1);
6057 if( bS[bS_index] == 0 ) {
6058 continue;
6061 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6062 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6063 alpha = (alpha_table+52)[index_a];
6064 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6066 if( bS[bS_index] < 4 ) {
6067 const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6068 const int p0 = pix[-1];
6069 const int p1 = pix[-2];
6070 const int p2 = pix[-3];
6071 const int q0 = pix[0];
6072 const int q1 = pix[1];
6073 const int q2 = pix[2];
6075 if( FFABS( p0 - q0 ) < alpha &&
6076 FFABS( p1 - p0 ) < beta &&
6077 FFABS( q1 - q0 ) < beta ) {
6078 int tc = tc0;
6079 int i_delta;
6081 if( FFABS( p2 - p0 ) < beta ) {
6082 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6083 tc++;
6085 if( FFABS( q2 - q0 ) < beta ) {
6086 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6087 tc++;
6090 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6091 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6092 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6093 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6095 }else{
6096 const int p0 = pix[-1];
6097 const int p1 = pix[-2];
6098 const int p2 = pix[-3];
6100 const int q0 = pix[0];
6101 const int q1 = pix[1];
6102 const int q2 = pix[2];
6104 if( FFABS( p0 - q0 ) < alpha &&
6105 FFABS( p1 - p0 ) < beta &&
6106 FFABS( q1 - q0 ) < beta ) {
6108 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6109 if( FFABS( p2 - p0 ) < beta)
6111 const int p3 = pix[-4];
6112 /* p0', p1', p2' */
6113 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6114 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6115 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6116 } else {
6117 /* p0' */
6118 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6120 if( FFABS( q2 - q0 ) < beta)
6122 const int q3 = pix[3];
6123 /* q0', q1', q2' */
6124 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6125 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6126 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6127 } else {
6128 /* q0' */
6129 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6131 }else{
6132 /* p0', q0' */
6133 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6134 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6136 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6141 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6142 int i;
6143 for( i = 0; i < 8; i++, pix += stride) {
6144 int index_a;
6145 int alpha;
6146 int beta;
6148 int qp_index;
6149 int bS_index = i;
6151 if( bS[bS_index] == 0 ) {
6152 continue;
6155 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6156 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6157 alpha = (alpha_table+52)[index_a];
6158 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6160 if( bS[bS_index] < 4 ) {
6161 const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6162 const int p0 = pix[-1];
6163 const int p1 = pix[-2];
6164 const int q0 = pix[0];
6165 const int q1 = pix[1];
6167 if( FFABS( p0 - q0 ) < alpha &&
6168 FFABS( p1 - p0 ) < beta &&
6169 FFABS( q1 - q0 ) < beta ) {
6170 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6172 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6173 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6174 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6176 }else{
6177 const int p0 = pix[-1];
6178 const int p1 = pix[-2];
6179 const int q0 = pix[0];
6180 const int q1 = pix[1];
6182 if( FFABS( p0 - q0 ) < alpha &&
6183 FFABS( p1 - p0 ) < beta &&
6184 FFABS( q1 - q0 ) < beta ) {
6186 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6187 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6188 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6194 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6195 const int index_a = qp + h->slice_alpha_c0_offset;
6196 const int alpha = (alpha_table+52)[index_a];
6197 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6198 if (alpha ==0 || beta == 0) return;
6200 if( bS[0] < 4 ) {
6201 int8_t tc[4];
6202 tc[0] = (tc0_table+52)[index_a][bS[0]];
6203 tc[1] = (tc0_table+52)[index_a][bS[1]];
6204 tc[2] = (tc0_table+52)[index_a][bS[2]];
6205 tc[3] = (tc0_table+52)[index_a][bS[3]];
6206 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6207 } else {
6208 h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6212 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6213 const int index_a = qp + h->slice_alpha_c0_offset;
6214 const int alpha = (alpha_table+52)[index_a];
6215 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6216 if (alpha ==0 || beta == 0) return;
6218 if( bS[0] < 4 ) {
6219 int8_t tc[4];
6220 tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6221 tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6222 tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6223 tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6224 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6225 } else {
6226 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
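/**
 * Simplified deblocking path used with CODEC_FLAG2_FAST: it handles
 * the common non-MBAFF frame case with the dsp loop-filter-strength
 * helper and falls back to filter_mb() at the picture border, when
 * the chroma QP offsets differ, or across slice boundaries when
 * deblocking_filter == 2.
 */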
6230 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6231 MpegEncContext * const s = &h->s;
6232 int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6233 int mb_xy, mb_type;
6234 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6236 mb_xy = h->mb_xy;
6238 if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6239 !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, so it has to be avoided, but it need not be avoided under CODEC_FLAG2_FAST
6240 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6241 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6242 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6243 return;
6245 assert(!FRAME_MBAFF);
6247 mb_type = s->current_picture.mb_type[mb_xy];
6248 qp = s->current_picture.qscale_table[mb_xy];
6249 qp0 = s->current_picture.qscale_table[mb_xy-1];
6250 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6251 qpc = get_chroma_qp( h, 0, qp );
6252 qpc0 = get_chroma_qp( h, 0, qp0 );
6253 qpc1 = get_chroma_qp( h, 0, qp1 );
6254 qp0 = (qp + qp0 + 1) >> 1;
6255 qp1 = (qp + qp1 + 1) >> 1;
6256 qpc0 = (qpc + qpc0 + 1) >> 1;
6257 qpc1 = (qpc + qpc1 + 1) >> 1;
6258 qp_thresh = 15 - h->slice_alpha_c0_offset;
6259 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6260 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6261 return;
6263 if( IS_INTRA(mb_type) ) {
6264 int16_t bS4[4] = {4,4,4,4};
6265 int16_t bS3[4] = {3,3,3,3};
6266 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6267 if( IS_8x8DCT(mb_type) ) {
6268 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6269 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6270 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6271 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6272 } else {
6273 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6274 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6275 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6276 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6277 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6278 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6279 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6280 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6282 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6283 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6284 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6285 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6286 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6287 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6288 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6289 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6290 return;
6291 } else {
6292 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6293 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6294 int edges;
6295 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6296 edges = 4;
6297 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6298 } else {
6299 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6300 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6301 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6302 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6303 ? 3 : 0;
6304 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6305 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6306 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6307 (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6309 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6310 bSv[0][0] = 0x0004000400040004ULL;
6311 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6312 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6314 #define FILTER(hv,dir,edge)\
6315 if(bSv[dir][edge]) {\
6316 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6317 if(!(edge&1)) {\
6318 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6319 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6322 if( edges == 1 ) {
6323 FILTER(v,0,0);
6324 FILTER(h,1,0);
6325 } else if( IS_8x8DCT(mb_type) ) {
6326 FILTER(v,0,0);
6327 FILTER(v,0,2);
6328 FILTER(h,1,0);
6329 FILTER(h,1,2);
6330 } else {
6331 FILTER(v,0,0);
6332 FILTER(v,0,1);
6333 FILTER(v,0,2);
6334 FILTER(v,0,3);
6335 FILTER(h,1,0);
6336 FILTER(h,1,1);
6337 FILTER(h,1,2);
6338 FILTER(h,1,3);
6340 #undef FILTER
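/**
 * Compute the boundary strengths bS and filter all edges of one
 * macroblock in a single direction (dir == 0: vertical edges,
 * dir == 1: horizontal edges), including the edge shared with the
 * neighbouring macroblock.
 */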
6345 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6346 MpegEncContext * const s = &h->s;
6347 int edge;
6348 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6349 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6350 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6351 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6352 int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6354 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6355 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6356 // how often to recheck mv-based bS when iterating between edges
6357 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6358 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6359 // how often to recheck mv-based bS when iterating along each edge
6360 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6362 if (first_vertical_edge_done) {
6363 start = 1;
6366 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6367 start = 1;
6369 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6370 && !IS_INTERLACED(mb_type)
6371 && IS_INTERLACED(mbm_type)
6373 // This is a special case in the norm where the filtering must
6374 // be done twice (once for each field) even if we are in a
6375 // frame macroblock.
6377 static const int nnz_idx[4] = {4,5,6,3};
6378 unsigned int tmp_linesize = 2 * linesize;
6379 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6380 int mbn_xy = mb_xy - 2 * s->mb_stride;
6381 int qp;
6382 int i, j;
6383 int16_t bS[4];
6385 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6386 if( IS_INTRA(mb_type) ||
6387 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6388 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6389 } else {
6390 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6391 for( i = 0; i < 4; i++ ) {
6392 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6393 mbn_nnz[nnz_idx[i]] != 0 )
6394 bS[i] = 2;
6395 else
6396 bS[i] = 1;
6399 // Do not use s->qscale as the luma quantizer because it does not have the same
6400 // value in IPCM macroblocks.
6401 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6402 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6403 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6404 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6405 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6406 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6407 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6408 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6411 start = 1;
6414 /* Calculate bS */
6415 for( edge = start; edge < edges; edge++ ) {
6416 /* mbn_xy: neighbor macroblock */
6417 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6418 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6419 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6420 int16_t bS[4];
6421 int qp;
6423 if( (edge&1) && IS_8x8DCT(mb_type) )
6424 continue;
6426 if( IS_INTRA(mb_type) ||
6427 IS_INTRA(mbn_type) ) {
6428 int value;
6429 if (edge == 0) {
6430 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6431 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6433 value = 4;
6434 } else {
6435 value = 3;
6437 } else {
6438 value = 3;
6440 bS[0] = bS[1] = bS[2] = bS[3] = value;
6441 } else {
6442 int i, l;
6443 int mv_done;
6445 if( edge & mask_edge ) {
6446 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6447 mv_done = 1;
6449 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6450 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6451 mv_done = 1;
6453 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6454 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6455 int bn_idx= b_idx - (dir ? 8:1);
6456 int v = 0;
6458 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6459 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6460 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6461 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6464 if(h->slice_type_nos == FF_B_TYPE && v){
6465 v=0;
6466 for( l = 0; !v && l < 2; l++ ) {
6467 int ln= 1-l;
6468 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6469 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6470 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6474 bS[0] = bS[1] = bS[2] = bS[3] = v;
6475 mv_done = 1;
6477 else
6478 mv_done = 0;
6480 for( i = 0; i < 4; i++ ) {
6481 int x = dir == 0 ? edge : i;
6482 int y = dir == 0 ? i : edge;
6483 int b_idx= 8 + 4 + x + 8*y;
6484 int bn_idx= b_idx - (dir ? 8:1);
6486 if( h->non_zero_count_cache[b_idx] |
6487 h->non_zero_count_cache[bn_idx] ) {
6488 bS[i] = 2;
6490 else if(!mv_done)
6492 bS[i] = 0;
6493 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6494 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6495 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6496 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6497 bS[i] = 1;
6498 break;
6502 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6503 bS[i] = 0;
6504 for( l = 0; l < 2; l++ ) {
6505 int ln= 1-l;
6506 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6507 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6508 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6509 bS[i] = 1;
6510 break;
6517 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6518 continue;
6521 /* Filter edge */
6522 // Do not use s->qscale as the luma quantizer because it does not have the same
6523 // value in IPCM macroblocks.
6524 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6525 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6526 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6527 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6528 if( dir == 0 ) {
6529 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6530 if( (edge&1) == 0 ) {
6531 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6532 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6533 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6534 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6536 } else {
6537 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6538 if( (edge&1) == 0 ) {
6539 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6540 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6541 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6542 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
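/**
 * Full deblocking of one macroblock: skips filtering when the QP is
 * low enough, rebuilds non_zero_count_cache for the CAVLC 8x8 DCT
 * case, handles the special MBAFF left edge and then filters the
 * vertical and horizontal edges via filter_mb_dir().
 */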
6548 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6549 MpegEncContext * const s = &h->s;
6550 const int mb_xy= mb_x + mb_y*s->mb_stride;
6551 const int mb_type = s->current_picture.mb_type[mb_xy];
6552 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6553 int first_vertical_edge_done = 0;
6554 av_unused int dir;
6556 // For sufficiently low QP, the filter would not change anything.
6557 // This is a conservative estimate: beta_offset and a more accurate chroma QP could also be checked.
6558 if(!FRAME_MBAFF){
6559 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6560 int qp = s->current_picture.qscale_table[mb_xy];
6561 if(qp <= qp_thresh
6562 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6563 && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6564 return;
6568 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6569 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6570 int top_type, left_type[2];
6571 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6572 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6573 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6575 if(IS_8x8DCT(top_type)){
6576 h->non_zero_count_cache[4+8*0]=
6577 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6578 h->non_zero_count_cache[6+8*0]=
6579 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6581 if(IS_8x8DCT(left_type[0])){
6582 h->non_zero_count_cache[3+8*1]=
6583 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6585 if(IS_8x8DCT(left_type[1])){
6586 h->non_zero_count_cache[3+8*3]=
6587 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6590 if(IS_8x8DCT(mb_type)){
6591 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6592 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
6594 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6595 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6597 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6598 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6600 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6601 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6605 if (FRAME_MBAFF
6606 // left mb is in picture
6607 && h->slice_table[mb_xy-1] != 0xFFFF
6608 // and current and left pair do not have the same interlaced type
6609 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6610 // and left mb is in the same slice if deblocking_filter == 2
6611 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6612 /* First vertical edge is different in MBAFF frames
6613 * There are 8 different bS to compute and 2 different Qp
6615 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6616 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6617 int16_t bS[8];
6618 int qp[2];
6619 int bqp[2];
6620 int rqp[2];
6621 int mb_qp, mbn0_qp, mbn1_qp;
6622 int i;
6623 first_vertical_edge_done = 1;
6625 if( IS_INTRA(mb_type) )
6626 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6627 else {
6628 for( i = 0; i < 8; i++ ) {
6629 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6631 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6632 bS[i] = 4;
6633 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6634 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6635 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6637 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6638 bS[i] = 2;
6639 else
6640 bS[i] = 1;
6644 mb_qp = s->current_picture.qscale_table[mb_xy];
6645 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6646 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6647 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6648 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6649 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6650 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6651 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6652 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6653 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6654 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6655 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6656 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6658 /* Filter edge */
6659 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6660 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6661 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6662 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6663 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6666 #if CONFIG_SMALL
6667 for( dir = 0; dir < 2; dir++ )
6668 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6669 #else
6670 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6671 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6672 #endif
6675 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6676 H264Context *h = *(void**)arg;
6677 MpegEncContext * const s = &h->s;
6678 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6680 s->mb_skip_run= -1;
6682 h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6683 (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6685 if( h->pps.cabac ) {
6686 int i;
6688 /* realign */
6689 align_get_bits( &s->gb );
6691 /* init cabac */
6692 ff_init_cabac_states( &h->cabac);
6693 ff_init_cabac_decoder( &h->cabac,
6694 s->gb.buffer + get_bits_count(&s->gb)/8,
6695 (get_bits_left(&s->gb) + 7)/8);
6696 /* calculate pre-state */
6697 for( i= 0; i < 460; i++ ) {
6698 int pre;
6699 if( h->slice_type_nos == FF_I_TYPE )
6700 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6701 else
6702 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6704 if( pre <= 63 )
6705 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6706 else
6707 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
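/* The loop above implements the CABAC context initialisation of the H.264
 * spec: preCtxState = clip(((m*QP)>>4)+n, 1, 126) per context, stored here
 * packed as cabac_state[i] = 2*pStateIdx + valMPS (pre <= 63 gives an MPS of
 * 0, pre >= 64 gives an MPS of 1). */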
6710 for(;;){
6711 //START_TIMER
6712 int ret = decode_mb_cabac(h);
6713 int eos;
6714 //STOP_TIMER("decode_mb_cabac")
6716 if(ret>=0) hl_decode_mb(h);
6718 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6719 s->mb_y++;
6721 ret = decode_mb_cabac(h);
6723 if(ret>=0) hl_decode_mb(h);
6724 s->mb_y--;
6726 eos = get_cabac_terminate( &h->cabac );
6728 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6729 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6730 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6731 return -1;
6734 if( ++s->mb_x >= s->mb_width ) {
6735 s->mb_x = 0;
6736 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6737 ++s->mb_y;
6738 if(FIELD_OR_MBAFF_PICTURE) {
6739 ++s->mb_y;
6743 if( eos || s->mb_y >= s->mb_height ) {
6744 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6745 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6746 return 0;
6750 } else {
6751 for(;;){
6752 int ret = decode_mb_cavlc(h);
6754 if(ret>=0) hl_decode_mb(h);
6756 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6757 s->mb_y++;
6758 ret = decode_mb_cavlc(h);
6760 if(ret>=0) hl_decode_mb(h);
6761 s->mb_y--;
6764 if(ret<0){
6765 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6766 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6768 return -1;
6771 if(++s->mb_x >= s->mb_width){
6772 s->mb_x=0;
6773 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6774 ++s->mb_y;
6775 if(FIELD_OR_MBAFF_PICTURE) {
6776 ++s->mb_y;
6778 if(s->mb_y >= s->mb_height){
6779 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6781 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6782 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6784 return 0;
6785 }else{
6786 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6788 return -1;
6793 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6794 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6795 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6796 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6798 return 0;
6799 }else{
6800 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6802 return -1;
6808 #if 0
6809 for(;s->mb_y < s->mb_height; s->mb_y++){
6810 for(;s->mb_x < s->mb_width; s->mb_x++){
6811 int ret= decode_mb(h);
6813 hl_decode_mb(h);
6815 if(ret<0){
6816 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6817 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6819 return -1;
6822 if(++s->mb_x >= s->mb_width){
6823 s->mb_x=0;
6824 if(++s->mb_y >= s->mb_height){
6825 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6826 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6828 return 0;
6829 }else{
6830 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6832 return -1;
6837 if(get_bits_count(s->gb) >= s->gb.size_in_bits){
6838 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6839 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6841 return 0;
6842 }else{
6843 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6845 return -1;
6849 s->mb_x=0;
6850 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6852 #endif
6853 return -1; //not reached
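/*
 * Parse a picture timing SEI message: the CPB removal and DPB output delays
 * (only when the SPS carries HRD parameters) and, if pic_struct_present_flag
 * is set, pic_struct plus the optional clock timestamps, of which only
 * ct_type is retained. See H.264 Annex D.
 */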
6856 static int decode_picture_timing(H264Context *h){
6857 MpegEncContext * const s = &h->s;
6858 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6859 h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6860 h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6862 if(h->sps.pic_struct_present_flag){
6863 unsigned int i, num_clock_ts;
6864 h->sei_pic_struct = get_bits(&s->gb, 4);
6865 h->sei_ct_type = 0;
6867 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6868 return -1;
6870 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6872 for (i = 0 ; i < num_clock_ts ; i++){
6873 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6874 unsigned int full_timestamp_flag;
6875 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6876 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6877 skip_bits(&s->gb, 5); /* counting_type */
6878 full_timestamp_flag = get_bits(&s->gb, 1);
6879 skip_bits(&s->gb, 1); /* discontinuity_flag */
6880 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6881 skip_bits(&s->gb, 8); /* n_frames */
6882 if(full_timestamp_flag){
6883 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6884 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6885 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6886 }else{
6887 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6888 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6889 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6890 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6891 if(get_bits(&s->gb, 1)) /* hours_flag */
6892 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6896 if(h->sps.time_offset_length > 0)
6897 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6901 if(s->avctx->debug & FF_DEBUG_PICT_INFO)
6902 av_log(s->avctx, AV_LOG_DEBUG, "ct_type:%X pic_struct:%d\n", h->sei_ct_type, h->sei_pic_struct);
6904 return 0;
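/*
 * Parse an unregistered user data SEI message. The only use made of it is to
 * detect the x264 version string and store the build number, which is used
 * elsewhere in the decoder for bug workarounds.
 */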
6907 static int decode_unregistered_user_data(H264Context *h, int size){
6908 MpegEncContext * const s = &h->s;
6909 uint8_t user_data[16+256];
6910 int e, build, i;
6912 if(size<16)
6913 return -1;
6915 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6916 user_data[i]= get_bits(&s->gb, 8);
6919 user_data[i]= 0;
6920 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6921 if(e==1 && build>=0)
6922 h->x264_build= build;
6924 if(s->avctx->debug & FF_DEBUG_BUGS)
6925 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6927 for(; i<size; i++)
6928 skip_bits(&s->gb, 8);
6930 return 0;
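/*
 * Parse a recovery point SEI message: recovery_frame_cnt gives the number of
 * frames after which output is expected to be correct again, and is also used
 * to mark the corresponding picture as a key frame. The remaining flags are
 * skipped.
 */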
6933 static int decode_recovery_point(H264Context *h){
6934 MpegEncContext * const s = &h->s;
6936 h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6937 skip_bits(&s->gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6939 return 0;
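/*
 * Parse a buffering period SEI message: for each scheduler selection index
 * the initial CPB removal delay is read (its offset is skipped), for both the
 * NAL and the VCL HRD when present. See H.264, D.1.1.
 */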
6942 static int decode_buffering_period(H264Context *h){
6943 MpegEncContext * const s = &h->s;
6944 unsigned int sps_id;
6945 int sched_sel_idx;
6946 SPS *sps;
6948 sps_id = get_ue_golomb_31(&s->gb);
6949 if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6950 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6951 return -1;
6953 sps = h->sps_buffers[sps_id];
6955 // NOTE: this loop is duplicated for the NAL and the VCL HRD in the standard; see H.264, D.1.1
6956 if (sps->nal_hrd_parameters_present_flag) {
6957 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6958 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6959 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6962 if (sps->vcl_hrd_parameters_present_flag) {
6963 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6964 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6965 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6969 h->sei_buffering_period_present = 1;
6970 return 0;
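/*
 * Parse all SEI messages in the current NAL unit. Each message starts with
 * its payload type and payload size, both coded as a run of 0xFF bytes
 * followed by a final byte that is added to the total; unknown payload types
 * are skipped.
 */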
6973 int ff_h264_decode_sei(H264Context *h){
6974 MpegEncContext * const s = &h->s;
6976 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6977 int size, type;
6979 type=0;
6981 type+= show_bits(&s->gb, 8);
6982 }while(get_bits(&s->gb, 8) == 255);
6984 size=0;
6986 size+= show_bits(&s->gb, 8);
6987 }while(get_bits(&s->gb, 8) == 255);
6989 switch(type){
6990 case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6991 if(decode_picture_timing(h) < 0)
6992 return -1;
6993 break;
6994 case SEI_TYPE_USER_DATA_UNREGISTERED:
6995 if(decode_unregistered_user_data(h, size) < 0)
6996 return -1;
6997 break;
6998 case SEI_TYPE_RECOVERY_POINT:
6999 if(decode_recovery_point(h) < 0)
7000 return -1;
7001 break;
7002 case SEI_BUFFERING_PERIOD:
7003 if(decode_buffering_period(h) < 0)
7004 return -1;
7005 break;
7006 default:
7007 skip_bits(&s->gb, 8*size);
7010 //FIXME check bits here
7011 align_get_bits(&s->gb);
7014 return 0;
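/*
 * Parse hrd_parameters() from the VUI (H.264 Annex E). The per-CPB bit rate
 * and buffer size values are read and discarded; only cpb_cnt and the delay
 * field lengths needed later for SEI parsing are stored in the SPS.
 */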
7017 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7018 MpegEncContext * const s = &h->s;
7019 int cpb_count, i;
7020 cpb_count = get_ue_golomb_31(&s->gb) + 1;
7022 if(cpb_count > 32U){
7023 av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7024 return -1;
7027 get_bits(&s->gb, 4); /* bit_rate_scale */
7028 get_bits(&s->gb, 4); /* cpb_size_scale */
7029 for(i=0; i<cpb_count; i++){
7030 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7031 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7032 get_bits1(&s->gb); /* cbr_flag */
7034 sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7035 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7036 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7037 sps->time_offset_length = get_bits(&s->gb, 5);
7038 sps->cpb_cnt = cpb_count;
7039 return 0;
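/*
 * Parse vui_parameters() (H.264 Annex E). Only the fields the decoder acts on
 * are kept: sample aspect ratio, chroma sample location, timing information,
 * the HRD presence flags, pic_struct_present_flag and num_reorder_frames from
 * the bitstream restrictions; everything else is read and discarded.
 */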
7042 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7043 MpegEncContext * const s = &h->s;
7044 int aspect_ratio_info_present_flag;
7045 unsigned int aspect_ratio_idc;
7047 aspect_ratio_info_present_flag= get_bits1(&s->gb);
7049 if( aspect_ratio_info_present_flag ) {
7050 aspect_ratio_idc= get_bits(&s->gb, 8);
7051 if( aspect_ratio_idc == EXTENDED_SAR ) {
7052 sps->sar.num= get_bits(&s->gb, 16);
7053 sps->sar.den= get_bits(&s->gb, 16);
7054 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7055 sps->sar= pixel_aspect[aspect_ratio_idc];
7056 }else{
7057 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7058 return -1;
7060 }else{
7061 sps->sar.num=
7062 sps->sar.den= 0;
7064 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7066 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7067 get_bits1(&s->gb); /* overscan_appropriate_flag */
7070 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7071 get_bits(&s->gb, 3); /* video_format */
7072 get_bits1(&s->gb); /* video_full_range_flag */
7073 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7074 get_bits(&s->gb, 8); /* colour_primaries */
7075 get_bits(&s->gb, 8); /* transfer_characteristics */
7076 get_bits(&s->gb, 8); /* matrix_coefficients */
7080 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7081 s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1; /* chroma_sample_location_type_top_field */
7082 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7085 sps->timing_info_present_flag = get_bits1(&s->gb);
7086 if(sps->timing_info_present_flag){
7087 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7088 sps->time_scale = get_bits_long(&s->gb, 32);
7089 if(sps->num_units_in_tick-1 > 0x7FFFFFFEU || sps->time_scale-1 > 0x7FFFFFFEU){
7090 av_log(h->s.avctx, AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
7091 return -1;
7093 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7096 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7097 if(sps->nal_hrd_parameters_present_flag)
7098 if(decode_hrd_parameters(h, sps) < 0)
7099 return -1;
7100 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7101 if(sps->vcl_hrd_parameters_present_flag)
7102 if(decode_hrd_parameters(h, sps) < 0)
7103 return -1;
7104 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7105 get_bits1(&s->gb); /* low_delay_hrd_flag */
7106 sps->pic_struct_present_flag = get_bits1(&s->gb);
7108 sps->bitstream_restriction_flag = get_bits1(&s->gb);
7109 if(sps->bitstream_restriction_flag){
7110 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7111 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7112 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7113 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7114 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7115 sps->num_reorder_frames= get_ue_golomb(&s->gb);
7116 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7118 if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7119 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7120 return -1;
7124 return 0;
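/*
 * Parse one scaling list. Deltas are signed Exp-Golomb coded relative to the
 * previous value; if the list is not present, the caller-supplied fallback is
 * copied, and a first delta that makes the value 0 selects the JVT default
 * list. Once the running value hits 0, no further deltas are read and the
 * remaining entries repeat the last value.
 */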
7127 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7128 const uint8_t *jvt_list, const uint8_t *fallback_list){
7129 MpegEncContext * const s = &h->s;
7130 int i, last = 8, next = 8;
7131 const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7132 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7133 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7134 else
7135 for(i=0;i<size;i++){
7136 if(next)
7137 next = (last + get_se_golomb(&s->gb)) & 0xff;
7138 if(!i && !next){ /* matrix not written, we use the preset one */
7139 memcpy(factors, jvt_list, size*sizeof(uint8_t));
7140 break;
7142 last = factors[scan[i]] = next ? next : last;
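/*
 * Parse the scaling matrices of an SPS or PPS. Each list falls back either to
 * the corresponding SPS list (for a PPS whose SPS carried matrices), to the
 * spec default list, or to the previously decoded list of the same prediction
 * type; the two 8x8 lists are only coded for an SPS or when the PPS enables
 * the 8x8 transform.
 */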
7146 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7147 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7148 MpegEncContext * const s = &h->s;
7149 int fallback_sps = !is_sps && sps->scaling_matrix_present;
7150 const uint8_t *fallback[4] = {
7151 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7152 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7153 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7154 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7156 if(get_bits1(&s->gb)){
7157 sps->scaling_matrix_present |= is_sps;
7158 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7159 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7160 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7161 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7162 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7163 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7164 if(is_sps || pps->transform_8x8_mode){
7165 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7166 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
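/*
 * Decode a sequence parameter set NAL unit and store it in h->sps_buffers
 * under its sps_id, replacing any previous SPS with the same id (it is also
 * copied into the active h->sps). High profile additionally carries
 * chroma_format_idc, the bit depths, transform bypass and scaling matrices
 * before the common fields.
 */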
7171 int ff_h264_decode_seq_parameter_set(H264Context *h){
7172 MpegEncContext * const s = &h->s;
7173 int profile_idc, level_idc;
7174 unsigned int sps_id;
7175 int i;
7176 SPS *sps;
7178 profile_idc= get_bits(&s->gb, 8);
7179 get_bits1(&s->gb); //constraint_set0_flag
7180 get_bits1(&s->gb); //constraint_set1_flag
7181 get_bits1(&s->gb); //constraint_set2_flag
7182 get_bits1(&s->gb); //constraint_set3_flag
7183 get_bits(&s->gb, 4); // reserved
7184 level_idc= get_bits(&s->gb, 8);
7185 sps_id= get_ue_golomb_31(&s->gb);
7187 if(sps_id >= MAX_SPS_COUNT) {
7188 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7189 return -1;
7191 sps= av_mallocz(sizeof(SPS));
7192 if(sps == NULL)
7193 return -1;
7195 sps->profile_idc= profile_idc;
7196 sps->level_idc= level_idc;
7198 memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7199 memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7200 sps->scaling_matrix_present = 0;
7202 if(sps->profile_idc >= 100){ //high profile
7203 sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7204 if(sps->chroma_format_idc == 3)
7205 sps->residual_color_transform_flag = get_bits1(&s->gb);
7206 sps->bit_depth_luma = get_ue_golomb(&s->gb) + 8;
7207 sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7208 sps->transform_bypass = get_bits1(&s->gb);
7209 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7210 }else{
7211 sps->chroma_format_idc= 1;
7214 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7215 sps->poc_type= get_ue_golomb_31(&s->gb);
7217 if(sps->poc_type == 0){ //FIXME #define
7218 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7219 } else if(sps->poc_type == 1){//FIXME #define
7220 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7221 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7222 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7223 sps->poc_cycle_length = get_ue_golomb(&s->gb);
7225 if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7226 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7227 goto fail;
7230 for(i=0; i<sps->poc_cycle_length; i++)
7231 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7232 }else if(sps->poc_type != 2){
7233 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7234 goto fail;
7237 sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7238 if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7239 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7240 goto fail;
7242 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7243 sps->mb_width = get_ue_golomb(&s->gb) + 1;
7244 sps->mb_height= get_ue_golomb(&s->gb) + 1;
7245 if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7246 avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7247 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7248 goto fail;
7251 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7252 if(!sps->frame_mbs_only_flag)
7253 sps->mb_aff= get_bits1(&s->gb);
7254 else
7255 sps->mb_aff= 0;
7257 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7259 #ifndef ALLOW_INTERLACE
7260 if(sps->mb_aff)
7261 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7262 #endif
7263 sps->crop= get_bits1(&s->gb);
7264 if(sps->crop){
7265 sps->crop_left = get_ue_golomb(&s->gb);
7266 sps->crop_right = get_ue_golomb(&s->gb);
7267 sps->crop_top = get_ue_golomb(&s->gb);
7268 sps->crop_bottom= get_ue_golomb(&s->gb);
7269 if(sps->crop_left || sps->crop_top){
7270 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7272 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7273 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7275 }else{
7276 sps->crop_left =
7277 sps->crop_right =
7278 sps->crop_top =
7279 sps->crop_bottom= 0;
7282 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7283 if( sps->vui_parameters_present_flag )
7284 if (decode_vui_parameters(h, sps) < 0)
7285 goto fail;
7287 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7288 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7289 sps_id, sps->profile_idc, sps->level_idc,
7290 sps->poc_type,
7291 sps->ref_frame_count,
7292 sps->mb_width, sps->mb_height,
7293 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7294 sps->direct_8x8_inference_flag ? "8B8" : "",
7295 sps->crop_left, sps->crop_right,
7296 sps->crop_top, sps->crop_bottom,
7297 sps->vui_parameters_present_flag ? "VUI" : "",
7298 ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7299 sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7300 sps->timing_info_present_flag ? sps->time_scale : 0
7304 av_free(h->sps_buffers[sps_id]);
7305 h->sps_buffers[sps_id]= sps;
7306 h->sps = *sps;
7307 return 0;
7308 fail:
7309 av_free(sps);
7310 return -1;
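/*
 * Precompute the luma-to-chroma QP mapping for one chroma_qp_index_offset:
 * every luma QP 0..51 maps to chroma_qp[clip(qp + offset, 0, 51)], so a
 * positive offset shifts the whole mapping towards higher chroma QPs,
 * saturating at 51.
 */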
7313 static void
7314 build_qp_table(PPS *pps, int t, int index)
7316 int i;
7317 for(i = 0; i < 52; i++)
7318 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
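/*
 * Decode a picture parameter set NAL unit and store it in h->pps_buffers
 * under its pps_id. The slice group count and map type are read, but FMO
 * itself is not supported; the optional trailing fields (8x8 transform flag,
 * PPS scaling matrices, second chroma QP offset) are only read if bits remain
 * before bit_length.
 */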
7321 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7322 MpegEncContext * const s = &h->s;
7323 unsigned int pps_id= get_ue_golomb(&s->gb);
7324 PPS *pps;
7326 if(pps_id >= MAX_PPS_COUNT) {
7327 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7328 return -1;
7331 pps= av_mallocz(sizeof(PPS));
7332 if(pps == NULL)
7333 return -1;
7334 pps->sps_id= get_ue_golomb_31(&s->gb);
7335 if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7336 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7337 goto fail;
7340 pps->cabac= get_bits1(&s->gb);
7341 pps->pic_order_present= get_bits1(&s->gb);
7342 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7343 if(pps->slice_group_count > 1 ){
7344 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7345 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7346 switch(pps->mb_slice_group_map_type){
7347 case 0:
7348 #if 0
7349 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7350 | run_length[ i ] |1 |ue(v) |
7351 #endif
7352 break;
7353 case 2:
7354 #if 0
7355 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7356 |{ | | |
7357 | top_left_mb[ i ] |1 |ue(v) |
7358 | bottom_right_mb[ i ] |1 |ue(v) |
7359 | } | | |
7360 #endif
7361 break;
7362 case 3:
7363 case 4:
7364 case 5:
7365 #if 0
7366 | slice_group_change_direction_flag |1 |u(1) |
7367 | slice_group_change_rate_minus1 |1 |ue(v) |
7368 #endif
7369 break;
7370 case 6:
7371 #if 0
7372 | slice_group_id_cnt_minus1 |1 |ue(v) |
7373 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7374 |) | | |
7375 | slice_group_id[ i ] |1 |u(v) |
7376 #endif
7377 break;
7380 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7381 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7382 if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7383 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7384 goto fail;
7387 pps->weighted_pred= get_bits1(&s->gb);
7388 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7389 pps->init_qp= get_se_golomb(&s->gb) + 26;
7390 pps->init_qs= get_se_golomb(&s->gb) + 26;
7391 pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7392 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7393 pps->constrained_intra_pred= get_bits1(&s->gb);
7394 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7396 pps->transform_8x8_mode= 0;
7397 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7398 memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7399 memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7401 if(get_bits_count(&s->gb) < bit_length){
7402 pps->transform_8x8_mode= get_bits1(&s->gb);
7403 decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7404 pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7405 } else {
7406 pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7409 build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7410 build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7411 if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7412 h->pps.chroma_qp_diff= 1;
7414 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7415 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7416 pps_id, pps->sps_id,
7417 pps->cabac ? "CABAC" : "CAVLC",
7418 pps->slice_group_count,
7419 pps->ref_count[0], pps->ref_count[1],
7420 pps->weighted_pred ? "weighted" : "",
7421 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7422 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7423 pps->constrained_intra_pred ? "CONSTR" : "",
7424 pps->redundant_pic_cnt_present ? "REDU" : "",
7425 pps->transform_8x8_mode ? "8x8DCT" : ""
7429 av_free(h->pps_buffers[pps_id]);
7430 h->pps_buffers[pps_id]= pps;
7431 return 0;
7432 fail:
7433 av_free(pps);
7434 return -1;
7438 * Call decode_slice() for each context.
7440 * @param h h264 master context
7441 * @param context_count number of contexts to execute
7443 static void execute_decode_slices(H264Context *h, int context_count){
7444 MpegEncContext * const s = &h->s;
7445 AVCodecContext * const avctx= s->avctx;
7446 H264Context *hx;
7447 int i;
7449 if (s->avctx->hwaccel)
7450 return;
7451 if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7452 return;
7453 if(context_count == 1) {
7454 decode_slice(avctx, &h);
7455 } else {
7456 for(i = 1; i < context_count; i++) {
7457 hx = h->thread_context[i];
7458 hx->s.error_recognition = avctx->error_recognition;
7459 hx->s.error_count = 0;
7462 avctx->execute(avctx, (void *)decode_slice,
7463 h->thread_context, NULL, context_count, sizeof(void*));
7465 /* pull back stuff from slices to master context */
7466 hx = h->thread_context[context_count - 1];
7467 s->mb_x = hx->s.mb_x;
7468 s->mb_y = hx->s.mb_y;
7469 s->dropable = hx->s.dropable;
7470 s->picture_structure = hx->s.picture_structure;
7471 for(i = 1; i < context_count; i++)
7472 h->s.error_count += h->thread_context[i]->s.error_count;
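/*
 * Split the input buffer into NAL units, either length-prefixed (AVC/avcC
 * mode, h->nal_length_size bytes per prefix) or separated by 00 00 01 start
 * codes (Annex B), unescape each one and dispatch on its type. Slices are
 * queued per thread context and handed to execute_decode_slices() once
 * h->max_contexts of them have been collected.
 */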
7477 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7478 MpegEncContext * const s = &h->s;
7479 AVCodecContext * const avctx= s->avctx;
7480 int buf_index=0;
7481 H264Context *hx; ///< thread context
7482 int context_count = 0;
7483 int next_avc= h->is_avc ? 0 : buf_size;
7485 h->max_contexts = avctx->thread_count;
7486 #if 0
7487 int i;
7488 for(i=0; i<50; i++){
7489 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7491 #endif
7492 if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7493 h->current_slice = 0;
7494 if (!s->first_field)
7495 s->current_picture_ptr= NULL;
7496 reset_sei(h);
7499 for(;;){
7500 int consumed;
7501 int dst_length;
7502 int bit_length;
7503 const uint8_t *ptr;
7504 int i, nalsize = 0;
7505 int err;
7507 if(buf_index >= next_avc) {
7508 if(buf_index >= buf_size) break;
7509 nalsize = 0;
7510 for(i = 0; i < h->nal_length_size; i++)
7511 nalsize = (nalsize << 8) | buf[buf_index++];
7512 if(nalsize <= 1 || nalsize > buf_size - buf_index){
7513 if(nalsize == 1){
7514 buf_index++;
7515 continue;
7516 }else{
7517 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7518 break;
7521 next_avc= buf_index + nalsize;
7522 } else {
7523 // start code prefix search
7524 for(; buf_index + 3 < buf_size; buf_index++){
7525 // This should always succeed in the first iteration.
7526 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7527 break;
7530 if(buf_index+3 >= buf_size) break;
7532 buf_index+=3;
7535 hx = h->thread_context[context_count];
7537 ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7538 if (ptr==NULL || dst_length < 0){
7539 return -1;
7541 while(dst_length > 0 && ptr[dst_length - 1] == 0)
7542 dst_length--;
7543 bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7545 if(s->avctx->debug&FF_DEBUG_STARTCODE){
7546 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7549 if (h->is_avc && (nalsize != consumed) && nalsize){
7550 int i, debug_level = AV_LOG_DEBUG;
7551 for (i = consumed; i < nalsize; i++)
7552 if (buf[buf_index+i])
7553 debug_level = AV_LOG_ERROR;
7554 av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7557 buf_index += consumed;
7559 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
7560 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7561 continue;
7563 again:
7564 err = 0;
7565 switch(hx->nal_unit_type){
7566 case NAL_IDR_SLICE:
7567 if (h->nal_unit_type != NAL_IDR_SLICE) {
7568 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7569 return -1;
7571 idr(h); //FIXME ensure we don't lose some frames if there is reordering
7572 case NAL_SLICE:
7573 init_get_bits(&hx->s.gb, ptr, bit_length);
7574 hx->intra_gb_ptr=
7575 hx->inter_gb_ptr= &hx->s.gb;
7576 hx->s.data_partitioning = 0;
7578 if((err = decode_slice_header(hx, h)))
7579 break;
7581 if (s->avctx->hwaccel && h->current_slice == 1) {
7582 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7583 return -1;
7586 s->current_picture_ptr->key_frame |=
7587 (hx->nal_unit_type == NAL_IDR_SLICE) ||
7588 (h->sei_recovery_frame_cnt >= 0);
7589 if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7590 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7591 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7592 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7593 && avctx->skip_frame < AVDISCARD_ALL){
7594 if(avctx->hwaccel) {
7595 if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7596 return -1;
7597 }else
7598 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7599 static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7600 ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7601 ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7602 }else
7603 context_count++;
7605 break;
7606 case NAL_DPA:
7607 init_get_bits(&hx->s.gb, ptr, bit_length);
7608 hx->intra_gb_ptr=
7609 hx->inter_gb_ptr= NULL;
7611 if ((err = decode_slice_header(hx, h)) < 0)
7612 break;
7614 hx->s.data_partitioning = 1;
7616 break;
7617 case NAL_DPB:
7618 init_get_bits(&hx->intra_gb, ptr, bit_length);
7619 hx->intra_gb_ptr= &hx->intra_gb;
7620 break;
7621 case NAL_DPC:
7622 init_get_bits(&hx->inter_gb, ptr, bit_length);
7623 hx->inter_gb_ptr= &hx->inter_gb;
7625 if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7626 && s->context_initialized
7627 && s->hurry_up < 5
7628 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7629 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7630 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7631 && avctx->skip_frame < AVDISCARD_ALL)
7632 context_count++;
7633 break;
7634 case NAL_SEI:
7635 init_get_bits(&s->gb, ptr, bit_length);
7636 ff_h264_decode_sei(h);
7637 break;
7638 case NAL_SPS:
7639 init_get_bits(&s->gb, ptr, bit_length);
7640 ff_h264_decode_seq_parameter_set(h);
7642 if(s->flags& CODEC_FLAG_LOW_DELAY)
7643 s->low_delay=1;
7645 if(avctx->has_b_frames < 2)
7646 avctx->has_b_frames= !s->low_delay;
7647 break;
7648 case NAL_PPS:
7649 init_get_bits(&s->gb, ptr, bit_length);
7651 ff_h264_decode_picture_parameter_set(h, bit_length);
7653 break;
7654 case NAL_AUD:
7655 case NAL_END_SEQUENCE:
7656 case NAL_END_STREAM:
7657 case NAL_FILLER_DATA:
7658 case NAL_SPS_EXT:
7659 case NAL_AUXILIARY_SLICE:
7660 break;
7661 default:
7662 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7665 if(context_count == h->max_contexts) {
7666 execute_decode_slices(h, context_count);
7667 context_count = 0;
7670 if (err < 0)
7671 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7672 else if(err == 1) {
7673 /* Slice could not be decoded in parallel mode, copy down
7674 * NAL unit stuff to context 0 and restart. Note that
7675 * rbsp_buffer is not transferred, but since we no longer
7676 * run in parallel mode this should not be an issue. */
7677 h->nal_unit_type = hx->nal_unit_type;
7678 h->nal_ref_idc = hx->nal_ref_idc;
7679 hx = h;
7680 goto again;
7683 if(context_count)
7684 execute_decode_slices(h, context_count);
7685 return buf_index;
7689 * returns the number of bytes consumed for building the current frame
7691 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7692 if(pos==0) pos=1; //avoid infinite loops (I doubt that is needed, but ...)
7693 if(pos+10>buf_size) pos=buf_size; // oops ;)
7695 return pos;
7698 static int decode_frame(AVCodecContext *avctx,
7699 void *data, int *data_size,
7700 AVPacket *avpkt)
7702 const uint8_t *buf = avpkt->data;
7703 int buf_size = avpkt->size;
7704 H264Context *h = avctx->priv_data;
7705 MpegEncContext *s = &h->s;
7706 AVFrame *pict = data;
7707 int buf_index;
7709 s->flags= avctx->flags;
7710 s->flags2= avctx->flags2;
7712 /* end of stream, output what is still in the buffers */
7713 if (buf_size == 0) {
7714 Picture *out;
7715 int i, out_idx;
7717 //FIXME factorize this with the output code below
7718 out = h->delayed_pic[0];
7719 out_idx = 0;
7720 for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7721 if(h->delayed_pic[i]->poc < out->poc){
7722 out = h->delayed_pic[i];
7723 out_idx = i;
7726 for(i=out_idx; h->delayed_pic[i]; i++)
7727 h->delayed_pic[i] = h->delayed_pic[i+1];
7729 if(out){
7730 *data_size = sizeof(AVFrame);
7731 *pict= *(AVFrame*)out;
7734 return 0;
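/* On the first call for an AVC (avcC extradata) stream, the SPS and PPS
 * embedded in the extradata are decoded below. In avcC the parameter sets are
 * always prefixed with 2-byte lengths, while the low bits of byte 4 carry the
 * NAL length size (lengthSizeMinusOne) used by the actual packets. */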
7737 if(h->is_avc && !h->got_avcC) {
7738 int i, cnt, nalsize;
7739 unsigned char *p = avctx->extradata;
7740 if(avctx->extradata_size < 7) {
7741 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7742 return -1;
7744 if(*p != 1) {
7745 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7746 return -1;
7748 /* sps and pps in the avcC always have length coded with 2 bytes,
7749 so put a fake nal_length_size = 2 while parsing them */
7750 h->nal_length_size = 2;
7751 // Decode sps from avcC
7752 cnt = *(p+5) & 0x1f; // Number of sps
7753 p += 6;
7754 for (i = 0; i < cnt; i++) {
7755 nalsize = AV_RB16(p) + 2;
7756 if(decode_nal_units(h, p, nalsize) < 0) {
7757 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7758 return -1;
7760 p += nalsize;
7762 // Decode pps from avcC
7763 cnt = *(p++); // Number of pps
7764 for (i = 0; i < cnt; i++) {
7765 nalsize = AV_RB16(p) + 2;
7766 if(decode_nal_units(h, p, nalsize) != nalsize) {
7767 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7768 return -1;
7770 p += nalsize;
7772 // Now store the right NAL length size, which will be used to parse all other NALs
7773 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7774 // Do not reparse avcC
7775 h->got_avcC = 1;
7778 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7779 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7780 return -1;
7781 h->got_avcC = 1;
7784 buf_index=decode_nal_units(h, buf, buf_size);
7785 if(buf_index < 0)
7786 return -1;
7788 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7789 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7790 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7791 return -1;
7794 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7795 Picture *out = s->current_picture_ptr;
7796 Picture *cur = s->current_picture_ptr;
7797 int i, pics, out_of_order, out_idx;
7799 field_end(h);
7801 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7802 /* Wait for second field. */
7803 *data_size = 0;
7805 } else {
7806 cur->interlaced_frame = 0;
7807 cur->repeat_pict = 0;
7809 /* Signal interlacing information externally. */
7810 /* Prefer picture timing SEI information over the decoding process, if present. */
7812 if(h->sps.pic_struct_present_flag){
7813 switch (h->sei_pic_struct)
7815 case SEI_PIC_STRUCT_FRAME:
7816 break;
7817 case SEI_PIC_STRUCT_TOP_FIELD:
7818 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7819 cur->interlaced_frame = 1;
7820 break;
7821 case SEI_PIC_STRUCT_TOP_BOTTOM:
7822 case SEI_PIC_STRUCT_BOTTOM_TOP:
7823 if (FIELD_OR_MBAFF_PICTURE)
7824 cur->interlaced_frame = 1;
7825 else
7826 // try to flag soft telecine progressive
7827 cur->interlaced_frame = h->prev_interlaced_frame;
7828 break;
7829 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7830 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7831 // Signal the possibility of telecined film externally (pic_struct 5,6)
7832 // From these hints, let applications decide whether to apply deinterlacing.
7833 cur->repeat_pict = 1;
7834 break;
7835 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7836 // Force progressive here, as doubling an interlaced frame is a bad idea.
7837 cur->repeat_pict = 2;
7838 break;
7839 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7840 cur->repeat_pict = 4;
7841 break;
7844 if ((h->sei_ct_type & 3) && h->sei_pic_struct <= SEI_PIC_STRUCT_BOTTOM_TOP)
7845 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7846 }else{
7847 /* Derive interlacing flag from used decoding process. */
7848 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7850 h->prev_interlaced_frame = cur->interlaced_frame;
7852 if (cur->field_poc[0] != cur->field_poc[1]){
7853 /* Derive top_field_first from field pocs. */
7854 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7855 }else{
7856 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7857 /* Use picture timing SEI information. Even if it describes a past frame, it is better than nothing. */
7858 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7859 || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7860 cur->top_field_first = 1;
7861 else
7862 cur->top_field_first = 0;
7863 }else{
7864 /* Most likely progressive */
7865 cur->top_field_first = 0;
7869 //FIXME do something with unavailable reference frames
7871 /* Sort B-frames into display order */
7873 if(h->sps.bitstream_restriction_flag
7874 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7875 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7876 s->low_delay = 0;
7879 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7880 && !h->sps.bitstream_restriction_flag){
7881 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7882 s->low_delay= 0;
7885 pics = 0;
7886 while(h->delayed_pic[pics]) pics++;
7888 assert(pics <= MAX_DELAYED_PIC_COUNT);
7890 h->delayed_pic[pics++] = cur;
7891 if(cur->reference == 0)
7892 cur->reference = DELAYED_PIC_REF;
7894 out = h->delayed_pic[0];
7895 out_idx = 0;
7896 for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7897 if(h->delayed_pic[i]->poc < out->poc){
7898 out = h->delayed_pic[i];
7899 out_idx = i;
7901 if(s->avctx->has_b_frames == 0 && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset))
7902 h->outputed_poc= INT_MIN;
7903 out_of_order = out->poc < h->outputed_poc;
7905 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7907 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7908 || (s->low_delay &&
7909 ((h->outputed_poc != INT_MIN && out->poc > h->outputed_poc + 2)
7910 || cur->pict_type == FF_B_TYPE)))
7912 s->low_delay = 0;
7913 s->avctx->has_b_frames++;
7916 if(out_of_order || pics > s->avctx->has_b_frames){
7917 out->reference &= ~DELAYED_PIC_REF;
7918 for(i=out_idx; h->delayed_pic[i]; i++)
7919 h->delayed_pic[i] = h->delayed_pic[i+1];
7921 if(!out_of_order && pics > s->avctx->has_b_frames){
7922 *data_size = sizeof(AVFrame);
7924 if(out_idx==0 && h->delayed_pic[0] && (h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset)) {
7925 h->outputed_poc = INT_MIN;
7926 } else
7927 h->outputed_poc = out->poc;
7928 *pict= *(AVFrame*)out;
7929 }else{
7930 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7935 assert(pict->data[0] || !*data_size);
7936 ff_print_debug_info(s, pict);
7937 //printf("out %d\n", (int)pict->data[0]);
7939 return get_consumed_bytes(s, buf_index, buf_size);
7941 #if 0
7942 static inline void fill_mb_avail(H264Context *h){
7943 MpegEncContext * const s = &h->s;
7944 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7946 if(s->mb_y){
7947 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7948 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7949 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7950 }else{
7951 h->mb_avail[0]=
7952 h->mb_avail[1]=
7953 h->mb_avail[2]= 0;
7955 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7956 h->mb_avail[4]= 1; //FIXME move out
7957 h->mb_avail[5]= 0; //FIXME move out
7959 #endif
7961 #ifdef TEST
7962 #undef printf
7963 #undef random
7964 #define COUNT 8000
7965 #define SIZE (COUNT*40)
7966 int main(void){
7967 int i;
7968 uint8_t temp[SIZE];
7969 PutBitContext pb;
7970 GetBitContext gb;
7971 // int int_temp[10000];
7972 DSPContext dsp;
7973 AVCodecContext avctx;
7975 dsputil_init(&dsp, &avctx);
7977 init_put_bits(&pb, temp, SIZE);
7978 printf("testing unsigned exp golomb\n");
7979 for(i=0; i<COUNT; i++){
7980 START_TIMER
7981 set_ue_golomb(&pb, i);
7982 STOP_TIMER("set_ue_golomb");
7984 flush_put_bits(&pb);
7986 init_get_bits(&gb, temp, 8*SIZE);
7987 for(i=0; i<COUNT; i++){
7988 int j, s;
7990 s= show_bits(&gb, 24);
7992 START_TIMER
7993 j= get_ue_golomb(&gb);
7994 if(j != i){
7995 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7996 // return -1;
7998 STOP_TIMER("get_ue_golomb");
8002 init_put_bits(&pb, temp, SIZE);
8003 printf("testing signed exp golomb\n");
8004 for(i=0; i<COUNT; i++){
8005 START_TIMER
8006 set_se_golomb(&pb, i - COUNT/2);
8007 STOP_TIMER("set_se_golomb");
8009 flush_put_bits(&pb);
8011 init_get_bits(&gb, temp, 8*SIZE);
8012 for(i=0; i<COUNT; i++){
8013 int j, s;
8015 s= show_bits(&gb, 24);
8017 START_TIMER
8018 j= get_se_golomb(&gb);
8019 if(j != i - COUNT/2){
8020 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8021 // return -1;
8023 STOP_TIMER("get_se_golomb");
8026 #if 0
8027 printf("testing 4x4 (I)DCT\n");
8029 DCTELEM block[16];
8030 uint8_t src[16], ref[16];
8031 uint64_t error= 0, max_error=0;
8033 for(i=0; i<COUNT; i++){
8034 int j;
8035 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8036 for(j=0; j<16; j++){
8037 ref[j]= random()%255;
8038 src[j]= random()%255;
8041 h264_diff_dct_c(block, src, ref, 4);
8043 //normalize
8044 for(j=0; j<16; j++){
8045 // printf("%d ", block[j]);
8046 block[j]= block[j]*4;
8047 if(j&1) block[j]= (block[j]*4 + 2)/5;
8048 if(j&4) block[j]= (block[j]*4 + 2)/5;
8050 // printf("\n");
8052 s->dsp.h264_idct_add(ref, block, 4);
8053 /* for(j=0; j<16; j++){
8054 printf("%d ", ref[j]);
8056 printf("\n");*/
8058 for(j=0; j<16; j++){
8059 int diff= FFABS(src[j] - ref[j]);
8061 error+= diff*diff;
8062 max_error= FFMAX(max_error, diff);
8065 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8066 printf("testing quantizer\n");
8067 for(qp=0; qp<52; qp++){
8068 for(i=0; i<16; i++)
8069 src1_block[i]= src2_block[i]= random()%255;
8072 printf("Testing NAL layer\n");
8074 uint8_t bitstream[COUNT];
8075 uint8_t nal[COUNT*2];
8076 H264Context h;
8077 memset(&h, 0, sizeof(H264Context));
8079 for(i=0; i<COUNT; i++){
8080 int zeros= i;
8081 int nal_length;
8082 int consumed;
8083 int out_length;
8084 uint8_t *out;
8085 int j;
8087 for(j=0; j<COUNT; j++){
8088 bitstream[j]= (random() % 255) + 1;
8091 for(j=0; j<zeros; j++){
8092 int pos= random() % COUNT;
8093 while(bitstream[pos] == 0){
8094 pos++;
8095 pos %= COUNT;
8097 bitstream[pos]=0;
8100 START_TIMER
8102 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8103 if(nal_length<0){
8104 printf("encoding failed\n");
8105 return -1;
8108 out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8110 STOP_TIMER("NAL")
8112 if(out_length != COUNT){
8113 printf("incorrect length %d %d\n", out_length, COUNT);
8114 return -1;
8117 if(consumed != nal_length){
8118 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8119 return -1;
8122 if(memcmp(bitstream, out, COUNT)){
8123 printf("mismatch\n");
8124 return -1;
8127 #endif
8129 printf("Testing RBSP\n");
8132 return 0;
8134 #endif /* TEST */
8137 av_cold void ff_h264_free_context(H264Context *h)
8139 int i;
8141 free_tables(h); //FIXME cleanup init stuff perhaps
8143 for(i = 0; i < MAX_SPS_COUNT; i++)
8144 av_freep(h->sps_buffers + i);
8146 for(i = 0; i < MAX_PPS_COUNT; i++)
8147 av_freep(h->pps_buffers + i);
8150 static av_cold int decode_end(AVCodecContext *avctx)
8152 H264Context *h = avctx->priv_data;
8153 MpegEncContext *s = &h->s;
8155 ff_h264_free_context(h);
8157 MPV_common_end(s);
8159 // memset(h, 0, sizeof(H264Context));
8161 return 0;
8165 AVCodec h264_decoder = {
8166 "h264",
8167 CODEC_TYPE_VIDEO,
8168 CODEC_ID_H264,
8169 sizeof(H264Context),
8170 decode_init,
8171 NULL,
8172 decode_end,
8173 decode_frame,
8174 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8175 .flush= flush_dpb,
8176 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8177 .pix_fmts= ff_hwaccel_pixfmt_list_420,
8180 #if CONFIG_H264_VDPAU_DECODER
8181 AVCodec h264_vdpau_decoder = {
8182 "h264_vdpau",
8183 CODEC_TYPE_VIDEO,
8184 CODEC_ID_H264,
8185 sizeof(H264Context),
8186 decode_init,
8187 NULL,
8188 decode_end,
8189 decode_frame,
8190 CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8191 .flush= flush_dpb,
8192 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8193 .pix_fmts = (const enum PixelFormat[]){PIX_FMT_VDPAU_H264, PIX_FMT_NONE},
8195 #endif
8197 #if CONFIG_SVQ3_DECODER
8198 #include "svq3.c"
8199 #endif