libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  64 static const int total_zeros_vlc_tables_size = 512;
  65
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2];
  72 static const int run_vlc_tables_size = 8;
  73
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values where changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1366     int i, si, di;
1367     uint8_t *dst;
1368     int bufidx;
1369
1370 //    src[0]&0x80;                //forbidden bit
1371     h->nal_ref_idc= src[0]>>5;
1372     h->nal_unit_type= src[0]&0x1F;
1373
1374     src++; length--;
1375 #if 0
1376     for(i=0; i<length; i++)
1377         printf("%2X ", src[i]);
1378 #endif
1379
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1382 #   define RS 7
1383     for(i=0; i+1<length; i+=9){
1384         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1385 # else
1386 #   define RS 3
1387     for(i=0; i+1<length; i+=5){
1388         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1389 # endif
1390             continue;
1391         if(i>0 && !src[i]) i--;
1392         while(src[i]) i++;
1393 #else
1394 #   define RS 0
1395     for(i=0; i+1<length; i+=2){
1396         if(src[i]) continue;
1397         if(i>0 && src[i-1]==0) i--;
1398 #endif
1399         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1400             if(src[i+2]!=3){
1401                 /* startcode, so we must be past the end */
1402                 length=i;
1403             }
1404             break;
1405         }
1406         i-= RS;
1407     }
1408
1409     if(i>=length-1){ //no escaped 0
1410         *dst_length= length;
1411         *consumed= length+1; //+1 for the header
1412         return src;
1413     }
1414
1415     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416     av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417     dst= h->rbsp_buffer[bufidx];
1418
1419     if (dst == NULL){
1420         return NULL;
1421     }
1422
1423 //printf("decoding esc\n");
1424     memcpy(dst, src, i);
1425     si=di=i;
1426     while(si+2<length){
1427         //remove escapes (very rare 1:2^22)
1428         if(src[si+2]>3){
1429             dst[di++]= src[si++];
1430             dst[di++]= src[si++];
1431         }else if(src[si]==0 && src[si+1]==0){
1432             if(src[si+2]==3){ //escape
1433                 dst[di++]= 0;
1434                 dst[di++]= 0;
1435                 si+=3;
1436                 continue;
1437             }else //next start code
1438                 goto nsc;
1439         }
1440
1441         dst[di++]= src[si++];
1442     }
1443     while(si<length)
1444         dst[di++]= src[si++];
1445 nsc:
1446
1447     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1448
1449     *dst_length= di;
1450     *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1452     return dst;
1453 }
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
1508 #if 0
1509 /**
1510  * DCT transforms the 16 dc values.
1511  * @param qp quantization parameter ??? FIXME
1512  */
1513 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1514 //    const int qmul= dequant_coeff[qp][0];
1515     int i;
1516     int temp[16]; //FIXME check if this is a good idea
1517     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1518     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1519
1520     for(i=0; i<4; i++){
1521         const int offset= y_offset[i];
1522         const int z0= block[offset+stride*0] + block[offset+stride*4];
1523         const int z1= block[offset+stride*0] - block[offset+stride*4];
1524         const int z2= block[offset+stride*1] - block[offset+stride*5];
1525         const int z3= block[offset+stride*1] + block[offset+stride*5];
1526
1527         temp[4*i+0]= z0+z3;
1528         temp[4*i+1]= z1+z2;
1529         temp[4*i+2]= z1-z2;
1530         temp[4*i+3]= z0-z3;
1531     }
1532
1533     for(i=0; i<4; i++){
1534         const int offset= x_offset[i];
1535         const int z0= temp[4*0+i] + temp[4*2+i];
1536         const int z1= temp[4*0+i] - temp[4*2+i];
1537         const int z2= temp[4*1+i] - temp[4*3+i];
1538         const int z3= temp[4*1+i] + temp[4*3+i];
1539
1540         block[stride*0 +offset]= (z0 + z3)>>1;
1541         block[stride*2 +offset]= (z1 + z2)>>1;
1542         block[stride*8 +offset]= (z1 - z2)>>1;
1543         block[stride*10+offset]= (z0 - z3)>>1;
1544     }
1545 }
1546 #endif
1547
1548 #undef xStride
1549 #undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
1572 #if 0
1573 static void chroma_dc_dct_c(DCTELEM *block){
1574     const int stride= 16*2;
1575     const int xStride= 16;
1576     int a,b,c,d,e;
1577
1578     a= block[stride*0 + xStride*0];
1579     b= block[stride*0 + xStride*1];
1580     c= block[stride*1 + xStride*0];
1581     d= block[stride*1 + xStride*1];
1582
1583     e= a-b;
1584     a= a+b;
1585     b= c-d;
1586     c= c+d;
1587
1588     block[stride*0 + xStride*0]= (a+c);
1589     block[stride*0 + xStride*1]= (e+b);
1590     block[stride*1 + xStride*0]= (a-c);
1591     block[stride*1 + xStride*1]= (e-b);
1592 }
1593 #endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604                            int src_x_offset, int src_y_offset,
1605                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606     MpegEncContext * const s = &h->s;
1607     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609     const int luma_xy= (mx&3) + ((my&3)<<2);
1610     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611     uint8_t * src_cb, * src_cr;
1612     int extra_width= h->emu_edge_width;
1613     int extra_height= h->emu_edge_height;
1614     int emu=0;
1615     const int full_mx= mx>>2;
1616     const int full_my= my>>2;
1617     const int pic_width  = 16*s->mb_width;
1618     const int pic_height = 16*s->mb_height >> MB_FIELD;
1619
1620     if(mx&7) extra_width -= 3;
1621     if(my&7) extra_height -= 3;
1622
1623     if(   full_mx < 0-extra_width
1624        || full_my < 0-extra_height
1625        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1629         emu=1;
1630     }
1631
1632     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1633     if(!square){
1634         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1635     }
1636
1637     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1638
1639     if(MB_FIELD){
1640         // chroma offset when predicting from a field of opposite parity
1641         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1643     }
1644     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1646
1647     if(emu){
1648         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649             src_cb= s->edge_emu_buffer;
1650     }
1651     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1652
1653     if(emu){
1654         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655             src_cr= s->edge_emu_buffer;
1656     }
1657     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1658 }
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799     MpegEncContext * const s = &h->s;
1800     const int mb_xy= h->mb_xy;
1801     const int mb_type= s->current_picture.mb_type[mb_xy];
1802
1803     assert(IS_INTER(mb_type));
1804
1805     prefetch_motion(h, 0);
1806
1807     if(IS_16X16(mb_type)){
1808         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810                 &weight_op[0], &weight_avg[0],
1811                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812     }else if(IS_16X8(mb_type)){
1813         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815                 &weight_op[1], &weight_avg[1],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819                 &weight_op[1], &weight_avg[1],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else if(IS_8X16(mb_type)){
1822         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824                 &weight_op[2], &weight_avg[2],
1825                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828                 &weight_op[2], &weight_avg[2],
1829                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830     }else{
1831         int i;
1832
1833         assert(IS_8X8(mb_type));
1834
1835         for(i=0; i<4; i++){
1836             const int sub_mb_type= h->sub_mb_type[i];
1837             const int n= 4*i;
1838             int x_offset= (i&1)<<2;
1839             int y_offset= (i&2)<<1;
1840
1841             if(IS_SUB_8X8(sub_mb_type)){
1842                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844                     &weight_op[3], &weight_avg[3],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_8X4(sub_mb_type)){
1847                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849                     &weight_op[4], &weight_avg[4],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853                     &weight_op[4], &weight_avg[4],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else if(IS_SUB_4X8(sub_mb_type)){
1856                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858                     &weight_op[5], &weight_avg[5],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                     &weight_op[5], &weight_avg[5],
1863                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864             }else{
1865                 int j;
1866                 assert(IS_SUB_4X4(sub_mb_type));
1867                 for(j=0; j<4; j++){
1868                     int sub_x_offset= x_offset + 2*(j&1);
1869                     int sub_y_offset= y_offset +   (j&2);
1870                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872                         &weight_op[6], &weight_avg[6],
1873                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1874                 }
1875             }
1876         }
1877     }
1878
1879     prefetch_motion(h, 1);
1880 }
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000     }
2001 }
2002
2003 static void init_dequant8_coeff_table(H264Context *h){
2004     int i,q,x;
2005     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2008
2009     for(i=0; i<2; i++ ){
2010         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012             break;
2013         }
2014
2015         for(q=0; q<52; q++){
2016             int shift = div6[q];
2017             int idx = rem6[q];
2018             for(x=0; x<64; x++)
2019                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021                     h->pps.scaling_matrix8[i][x]) << shift;
2022         }
2023     }
2024 }
2025
2026 static void init_dequant4_coeff_table(H264Context *h){
2027     int i,j,q,x;
2028     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029     for(i=0; i<6; i++ ){
2030         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031         for(j=0; j<i; j++){
2032             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034                 break;
2035             }
2036         }
2037         if(j<i)
2038             continue;
2039
2040         for(q=0; q<52; q++){
2041             int shift = div6[q] + 2;
2042             int idx = rem6[q];
2043             for(x=0; x<16; x++)
2044                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046                     h->pps.scaling_matrix4[i][x]) << shift;
2047         }
2048     }
2049 }
2050
2051 static void init_dequant_tables(H264Context *h){
2052     int i,x;
2053     init_dequant4_coeff_table(h);
2054     if(h->pps.transform_8x8_mode)
2055         init_dequant8_coeff_table(h);
2056     if(h->sps.transform_bypass){
2057         for(i=0; i<6; i++)
2058             for(x=0; x<16; x++)
2059                 h->dequant4_coeff[i][0][x] = 1<<6;
2060         if(h->pps.transform_8x8_mode)
2061             for(i=0; i<2; i++)
2062                 for(x=0; x<64; x++)
2063                     h->dequant8_coeff[i][0][x] = 1<<6;
2064     }
2065 }
2066
2067
2068 /**
2069  * allocates tables.
2070  * needs width/height
2071  */
2072 static int alloc_tables(H264Context *h){
2073     MpegEncContext * const s = &h->s;
2074     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075     int x,y;
2076
2077     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2078
2079     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2080     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2081     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2082
2083     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2087
2088     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2089     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2090
2091     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2092     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093     for(y=0; y<s->mb_height; y++){
2094         for(x=0; x<s->mb_width; x++){
2095             const int mb_xy= x + y*s->mb_stride;
2096             const int b_xy = 4*x + 4*y*h->b_stride;
2097             const int b8_xy= 2*x + 2*y*h->b8_stride;
2098
2099             h->mb2b_xy [mb_xy]= b_xy;
2100             h->mb2b8_xy[mb_xy]= b8_xy;
2101         }
2102     }
2103
2104     s->obmc_scratchpad = NULL;
2105
2106     if(!h->dequant4_coeff[0])
2107         init_dequant_tables(h);
2108
2109     return 0;
2110 fail:
2111     free_tables(h);
2112     return -1;
2113 }
2114
2115 /**
2116  * Mimic alloc_tables(), but for every context thread.
2117  */
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2120     dst->non_zero_count           = src->non_zero_count;
2121     dst->slice_table              = src->slice_table;
2122     dst->cbp_table                = src->cbp_table;
2123     dst->mb2b_xy                  = src->mb2b_xy;
2124     dst->mb2b8_xy                 = src->mb2b8_xy;
2125     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2126     dst->mvd_table[0]             = src->mvd_table[0];
2127     dst->mvd_table[1]             = src->mvd_table[1];
2128     dst->direct_table             = src->direct_table;
2129
2130     dst->s.obmc_scratchpad = NULL;
2131     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2132 }
2133
2134 /**
2135  * Init context
2136  * Allocate buffers which are not shared amongst multiple threads.
2137  */
2138 static int context_init(H264Context *h){
2139     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141
2142     return 0;
2143 fail:
2144     return -1; // free_tables will clean up for us
2145 }
2146
2147 static av_cold void common_init(H264Context *h){
2148     MpegEncContext * const s = &h->s;
2149
2150     s->width = s->avctx->width;
2151     s->height = s->avctx->height;
2152     s->codec_id= s->avctx->codec->id;
2153
2154     ff_h264_pred_init(&h->hpc, s->codec_id);
2155
2156     h->dequant_coeff_pps= -1;
2157     s->unrestricted_mv=1;
2158     s->decode=1; //FIXME
2159
2160     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2161
2162     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2163     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2164 }
2165
2166 /**
2167  * Reset SEI values at the beginning of the frame.
2168  *
2169  * @param h H.264 context.
2170  */
2171 static void reset_sei(H264Context *h) {
2172     h->sei_recovery_frame_cnt       = -1;
2173     h->sei_dpb_output_delay         =  0;
2174     h->sei_cpb_removal_delay        = -1;
2175     h->sei_buffering_period_present =  0;
2176 }
2177
2178 static av_cold int decode_init(AVCodecContext *avctx){
2179     H264Context *h= avctx->priv_data;
2180     MpegEncContext * const s = &h->s;
2181
2182     MPV_decode_defaults(s);
2183
2184     s->avctx = avctx;
2185     common_init(h);
2186
2187     s->out_format = FMT_H264;
2188     s->workaround_bugs= avctx->workaround_bugs;
2189
2190     // set defaults
2191 //    s->decode_mb= ff_h263_decode_mb;
2192     s->quarter_sample = 1;
2193     if(!avctx->has_b_frames)
2194     s->low_delay= 1;
2195
2196     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2197         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2198     else
2199         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201
2202     decode_init_vlc();
2203
2204     if(avctx->extradata_size > 0 && avctx->extradata &&
2205        *(char *)avctx->extradata == 1){
2206         h->is_avc = 1;
2207         h->got_avcC = 0;
2208     } else {
2209         h->is_avc = 0;
2210     }
2211
2212     h->thread_context[0] = h;
2213     h->outputed_poc = INT_MIN;
2214     h->prev_poc_msb= 1<<16;
2215     reset_sei(h);
2216     if(avctx->codec_id == CODEC_ID_H264){
2217         if(avctx->ticks_per_frame == 1){
2218             s->avctx->time_base.den *=2;
2219         }
2220         avctx->ticks_per_frame = 2;
2221     }
2222     return 0;
2223 }
2224
2225 static int frame_start(H264Context *h){
2226     MpegEncContext * const s = &h->s;
2227     int i;
2228
2229     if(MPV_frame_start(s, s->avctx) < 0)
2230         return -1;
2231     ff_er_frame_start(s);
2232     /*
2233      * MPV_frame_start uses pict_type to derive key_frame.
2234      * This is incorrect for H.264; IDR markings must be used.
2235      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2236      * See decode_nal_units().
2237      */
2238     s->current_picture_ptr->key_frame= 0;
2239
2240     assert(s->linesize && s->uvlinesize);
2241
2242     for(i=0; i<16; i++){
2243         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2244         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2245     }
2246     for(i=0; i<4; i++){
2247         h->block_offset[16+i]=
2248         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2249         h->block_offset[24+16+i]=
2250         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251     }
2252
2253     /* can't be in alloc_tables because linesize isn't known there.
2254      * FIXME: redo bipred weight to not require extra buffer? */
2255     for(i = 0; i < s->avctx->thread_count; i++)
2256         if(!h->thread_context[i]->s.obmc_scratchpad)
2257             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2258
2259     /* some macroblocks will be accessed before they're available */
2260     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2261         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2262
2263 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2264
2265     // We mark the current picture as non-reference after allocating it, so
2266     // that if we break out due to an error it can be released automatically
2267     // in the next MPV_frame_start().
2268     // SVQ3 as well as most other codecs have only last/next/current and thus
2269     // get released even with set reference, besides SVQ3 and others do not
2270     // mark frames as reference later "naturally".
2271     if(s->codec_id != CODEC_ID_SVQ3)
2272         s->current_picture_ptr->reference= 0;
2273
2274     s->current_picture_ptr->field_poc[0]=
2275     s->current_picture_ptr->field_poc[1]= INT_MAX;
2276     assert(s->current_picture_ptr->long_ref==0);
2277
2278     return 0;
2279 }
2280
2281 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2282     MpegEncContext * const s = &h->s;
2283     int i;
2284     int step    = 1;
2285     int offset  = 1;
2286     int uvoffset= 1;
2287     int top_idx = 1;
2288     int skiplast= 0;
2289
2290     src_y  -=   linesize;
2291     src_cb -= uvlinesize;
2292     src_cr -= uvlinesize;
2293
2294     if(!simple && FRAME_MBAFF){
2295         if(s->mb_y&1){
2296             offset  = MB_MBAFF ? 1 : 17;
2297             uvoffset= MB_MBAFF ? 1 : 9;
2298             if(!MB_MBAFF){
2299                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2300                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2301                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2302                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2303                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2304                 }
2305             }
2306         }else{
2307             if(!MB_MBAFF){
2308                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2309                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2310                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2311                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2312                 }
2313                 skiplast= 1;
2314             }
2315             offset  =
2316             uvoffset=
2317             top_idx = MB_MBAFF ? 0 : 1;
2318         }
2319         step= MB_MBAFF ? 2 : 1;
2320     }
2321
2322     // There are two lines saved, the line above the the top macroblock of a pair,
2323     // and the line above the bottom macroblock
2324     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2325     for(i=1; i<17 - skiplast; i++){
2326         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2327     }
2328
2329     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2330     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2331
2332     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2333         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2334         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2335         for(i=1; i<9 - skiplast; i++){
2336             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2337             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2338         }
2339         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2340         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2341     }
2342 }
2343
2344 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2345     MpegEncContext * const s = &h->s;
2346     int temp8, i;
2347     uint64_t temp64;
2348     int deblock_left;
2349     int deblock_top;
2350     int mb_xy;
2351     int step    = 1;
2352     int offset  = 1;
2353     int uvoffset= 1;
2354     int top_idx = 1;
2355
2356     if(!simple && FRAME_MBAFF){
2357         if(s->mb_y&1){
2358             offset  = MB_MBAFF ? 1 : 17;
2359             uvoffset= MB_MBAFF ? 1 : 9;
2360         }else{
2361             offset  =
2362             uvoffset=
2363             top_idx = MB_MBAFF ? 0 : 1;
2364         }
2365         step= MB_MBAFF ? 2 : 1;
2366     }
2367
2368     if(h->deblocking_filter == 2) {
2369         mb_xy = h->mb_xy;
2370         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2371         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2372     } else {
2373         deblock_left = (s->mb_x > 0);
2374         deblock_top =  (s->mb_y > !!MB_FIELD);
2375     }
2376
2377     src_y  -=   linesize + 1;
2378     src_cb -= uvlinesize + 1;
2379     src_cr -= uvlinesize + 1;
2380
2381 #define XCHG(a,b,t,xchg)\
2382 t= a;\
2383 if(xchg)\
2384     a= b;\
2385 b= t;
2386
2387     if(deblock_left){
2388         for(i = !deblock_top; i<16; i++){
2389             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2390         }
2391         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2392     }
2393
2394     if(deblock_top){
2395         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2396         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2397         if(s->mb_x+1 < s->mb_width){
2398             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2399         }
2400     }
2401
2402     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2403         if(deblock_left){
2404             for(i = !deblock_top; i<8; i++){
2405                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2406                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2407             }
2408             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2409             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2410         }
2411         if(deblock_top){
2412             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2413             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2414         }
2415     }
2416 }
2417
2418 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2419     MpegEncContext * const s = &h->s;
2420     const int mb_x= s->mb_x;
2421     const int mb_y= s->mb_y;
2422     const int mb_xy= h->mb_xy;
2423     const int mb_type= s->current_picture.mb_type[mb_xy];
2424     uint8_t  *dest_y, *dest_cb, *dest_cr;
2425     int linesize, uvlinesize /*dct_offset*/;
2426     int i;
2427     int *block_offset = &h->block_offset[0];
2428     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2429     /* is_h264 should always be true if SVQ3 is disabled. */
2430     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2431     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2432     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2433
2434     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2435     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2436     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2437
2438     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2439     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2440
2441     if (!simple && MB_FIELD) {
2442         linesize   = h->mb_linesize   = s->linesize * 2;
2443         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2444         block_offset = &h->block_offset[24];
2445         if(mb_y&1){ //FIXME move out of this function?
2446             dest_y -= s->linesize*15;
2447             dest_cb-= s->uvlinesize*7;
2448             dest_cr-= s->uvlinesize*7;
2449         }
2450         if(FRAME_MBAFF) {
2451             int list;
2452             for(list=0; list<h->list_count; list++){
2453                 if(!USES_LIST(mb_type, list))
2454                     continue;
2455                 if(IS_16X16(mb_type)){
2456                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2457                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2458                 }else{
2459                     for(i=0; i<16; i+=4){
2460                         int ref = h->ref_cache[list][scan8[i]];
2461                         if(ref >= 0)
2462                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2463                     }
2464                 }
2465             }
2466         }
2467     } else {
2468         linesize   = h->mb_linesize   = s->linesize;
2469         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2470 //        dct_offset = s->linesize * 16;
2471     }
2472
2473     if (!simple && IS_INTRA_PCM(mb_type)) {
2474         for (i=0; i<16; i++) {
2475             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2476         }
2477         for (i=0; i<8; i++) {
2478             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2479             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2480         }
2481     } else {
2482         if(IS_INTRA(mb_type)){
2483             if(h->deblocking_filter)
2484                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2485
2486             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2487                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2488                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2489             }
2490
2491             if(IS_INTRA4x4(mb_type)){
2492                 if(simple || !s->encoding){
2493                     if(IS_8x8DCT(mb_type)){
2494                         if(transform_bypass){
2495                             idct_dc_add =
2496                             idct_add    = s->dsp.add_pixels8;
2497                         }else{
2498                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2499                             idct_add    = s->dsp.h264_idct8_add;
2500                         }
2501                         for(i=0; i<16; i+=4){
2502                             uint8_t * const ptr= dest_y + block_offset[i];
2503                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2504                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2505                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2506                             }else{
2507                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2508                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2509                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2510                                 if(nnz){
2511                                     if(nnz == 1 && h->mb[i*16])
2512                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2513                                     else
2514                                         idct_add   (ptr, h->mb + i*16, linesize);
2515                                 }
2516                             }
2517                         }
2518                     }else{
2519                         if(transform_bypass){
2520                             idct_dc_add =
2521                             idct_add    = s->dsp.add_pixels4;
2522                         }else{
2523                             idct_dc_add = s->dsp.h264_idct_dc_add;
2524                             idct_add    = s->dsp.h264_idct_add;
2525                         }
2526                         for(i=0; i<16; i++){
2527                             uint8_t * const ptr= dest_y + block_offset[i];
2528                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2529
2530                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2531                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2532                             }else{
2533                                 uint8_t *topright;
2534                                 int nnz, tr;
2535                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2536                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2537                                     assert(mb_y || linesize <= block_offset[i]);
2538                                     if(!topright_avail){
2539                                         tr= ptr[3 - linesize]*0x01010101;
2540                                         topright= (uint8_t*) &tr;
2541                                     }else
2542                                         topright= ptr + 4 - linesize;
2543                                 }else
2544                                     topright= NULL;
2545
2546                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2547                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2548                                 if(nnz){
2549                                     if(is_h264){
2550                                         if(nnz == 1 && h->mb[i*16])
2551                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2552                                         else
2553                                             idct_add   (ptr, h->mb + i*16, linesize);
2554                                     }else
2555                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2556                                 }
2557                             }
2558                         }
2559                     }
2560                 }
2561             }else{
2562                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2563                 if(is_h264){
2564                     if(!transform_bypass)
2565                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2566                 }else
2567                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2568             }
2569             if(h->deblocking_filter)
2570                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2571         }else if(is_h264){
2572             hl_motion(h, dest_y, dest_cb, dest_cr,
2573                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2574                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2575                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2576         }
2577
2578
2579         if(!IS_INTRA4x4(mb_type)){
2580             if(is_h264){
2581                 if(IS_INTRA16x16(mb_type)){
2582                     if(transform_bypass){
2583                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2584                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2585                         }else{
2586                             for(i=0; i<16; i++){
2587                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2588                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2589                             }
2590                         }
2591                     }else{
2592                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2593                     }
2594                 }else if(h->cbp&15){
2595                     if(transform_bypass){
2596                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2597                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2598                         for(i=0; i<16; i+=di){
2599                             if(h->non_zero_count_cache[ scan8[i] ]){
2600                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2601                             }
2602                         }
2603                     }else{
2604                         if(IS_8x8DCT(mb_type)){
2605                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2606                         }else{
2607                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2608                         }
2609                     }
2610                 }
2611             }else{
2612                 for(i=0; i<16; i++){
2613                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2614                         uint8_t * const ptr= dest_y + block_offset[i];
2615                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2616                     }
2617                 }
2618             }
2619         }
2620
2621         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2622             uint8_t *dest[2] = {dest_cb, dest_cr};
2623             if(transform_bypass){
2624                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2625                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2626                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2627                 }else{
2628                     idct_add = s->dsp.add_pixels4;
2629                     for(i=16; i<16+8; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2631                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2632                     }
2633                 }
2634             }else{
2635                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2636                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2637                 if(is_h264){
2638                     idct_add = s->dsp.h264_idct_add;
2639                     idct_dc_add = s->dsp.h264_idct_dc_add;
2640                     for(i=16; i<16+8; i++){
2641                         if(h->non_zero_count_cache[ scan8[i] ])
2642                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2643                         else if(h->mb[i*16])
2644                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2645                     }
2646                 }else{
2647                     for(i=16; i<16+8; i++){
2648                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2649                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2650                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2651                         }
2652                     }
2653                 }
2654             }
2655         }
2656     }
2657     if(h->cbp || IS_INTRA(mb_type))
2658         s->dsp.clear_blocks(h->mb);
2659
2660     if(h->deblocking_filter) {
2661         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2662         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2663         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2664         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2665         if (!simple && FRAME_MBAFF) {
2666             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2667         } else {
2668             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2669         }
2670     }
2671 }
2672
2673 /**
2674  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2675  */
2676 static void hl_decode_mb_simple(H264Context *h){
2677     hl_decode_mb_internal(h, 1);
2678 }
2679
2680 /**
2681  * Process a macroblock; this handles edge cases, such as interlacing.
2682  */
2683 static void av_noinline hl_decode_mb_complex(H264Context *h){
2684     hl_decode_mb_internal(h, 0);
2685 }
2686
2687 static void hl_decode_mb(H264Context *h){
2688     MpegEncContext * const s = &h->s;
2689     const int mb_xy= h->mb_xy;
2690     const int mb_type= s->current_picture.mb_type[mb_xy];
2691     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2692
2693     if (is_complex)
2694         hl_decode_mb_complex(h);
2695     else hl_decode_mb_simple(h);
2696 }
2697
2698 static void pic_as_field(Picture *pic, const int parity){
2699     int i;
2700     for (i = 0; i < 4; ++i) {
2701         if (parity == PICT_BOTTOM_FIELD)
2702             pic->data[i] += pic->linesize[i];
2703         pic->reference = parity;
2704         pic->linesize[i] *= 2;
2705     }
2706     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2707 }
2708
2709 static int split_field_copy(Picture *dest, Picture *src,
2710                             int parity, int id_add){
2711     int match = !!(src->reference & parity);
2712
2713     if (match) {
2714         *dest = *src;
2715         if(parity != PICT_FRAME){
2716             pic_as_field(dest, parity);
2717             dest->pic_id *= 2;
2718             dest->pic_id += id_add;
2719         }
2720     }
2721
2722     return match;
2723 }
2724
2725 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2726     int i[2]={0};
2727     int index=0;
2728
2729     while(i[0]<len || i[1]<len){
2730         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2731             i[0]++;
2732         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2733             i[1]++;
2734         if(i[0] < len){
2735             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2736             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2737         }
2738         if(i[1] < len){
2739             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2740             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2741         }
2742     }
2743
2744     return index;
2745 }
2746
2747 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2748     int i, best_poc;
2749     int out_i= 0;
2750
2751     for(;;){
2752         best_poc= dir ? INT_MIN : INT_MAX;
2753
2754         for(i=0; i<len; i++){
2755             const int poc= src[i]->poc;
2756             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2757                 best_poc= poc;
2758                 sorted[out_i]= src[i];
2759             }
2760         }
2761         if(best_poc == (dir ? INT_MIN : INT_MAX))
2762             break;
2763         limit= sorted[out_i++]->poc - dir;
2764     }
2765     return out_i;
2766 }
2767
2768 /**
2769  * fills the default_ref_list.
2770  */
2771 static int fill_default_ref_list(H264Context *h){
2772     MpegEncContext * const s = &h->s;
2773     int i, len;
2774
2775     if(h->slice_type_nos==FF_B_TYPE){
2776         Picture *sorted[32];
2777         int cur_poc, list;
2778         int lens[2];
2779
2780         if(FIELD_PICTURE)
2781             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2782         else
2783             cur_poc= s->current_picture_ptr->poc;
2784
2785         for(list= 0; list<2; list++){
2786             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2787             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2788             assert(len<=32);
2789             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2790             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2791             assert(len<=32);
2792
2793             if(len < h->ref_count[list])
2794                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2795             lens[list]= len;
2796         }
2797
2798         if(lens[0] == lens[1] && lens[1] > 1){
2799             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2800             if(i == lens[0])
2801                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2802         }
2803     }else{
2804         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2805         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2806         assert(len <= 32);
2807         if(len < h->ref_count[0])
2808             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2809     }
2810 #ifdef TRACE
2811     for (i=0; i<h->ref_count[0]; i++) {
2812         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2813     }
2814     if(h->slice_type_nos==FF_B_TYPE){
2815         for (i=0; i<h->ref_count[1]; i++) {
2816             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2817         }
2818     }
2819 #endif
2820     return 0;
2821 }
2822
2823 static void print_short_term(H264Context *h);
2824 static void print_long_term(H264Context *h);
2825
2826 /**
2827  * Extract structure information about the picture described by pic_num in
2828  * the current decoding context (frame or field). Note that pic_num is
2829  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2830  * @param pic_num picture number for which to extract structure information
2831  * @param structure one of PICT_XXX describing structure of picture
2832  *                      with pic_num
2833  * @return frame number (short term) or long term index of picture
2834  *         described by pic_num
2835  */
2836 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2837     MpegEncContext * const s = &h->s;
2838
2839     *structure = s->picture_structure;
2840     if(FIELD_PICTURE){
2841         if (!(pic_num & 1))
2842             /* opposite field */
2843             *structure ^= PICT_FRAME;
2844         pic_num >>= 1;
2845     }
2846
2847     return pic_num;
2848 }
2849
2850 static int decode_ref_pic_list_reordering(H264Context *h){
2851     MpegEncContext * const s = &h->s;
2852     int list, index, pic_structure;
2853
2854     print_short_term(h);
2855     print_long_term(h);
2856
2857     for(list=0; list<h->list_count; list++){
2858         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2859
2860         if(get_bits1(&s->gb)){
2861             int pred= h->curr_pic_num;
2862
2863             for(index=0; ; index++){
2864                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2865                 unsigned int pic_id;
2866                 int i;
2867                 Picture *ref = NULL;
2868
2869                 if(reordering_of_pic_nums_idc==3)
2870                     break;
2871
2872                 if(index >= h->ref_count[list]){
2873                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2874                     return -1;
2875                 }
2876
2877                 if(reordering_of_pic_nums_idc<3){
2878                     if(reordering_of_pic_nums_idc<2){
2879                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2880                         int frame_num;
2881
2882                         if(abs_diff_pic_num > h->max_pic_num){
2883                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2884                             return -1;
2885                         }
2886
2887                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2888                         else                                pred+= abs_diff_pic_num;
2889                         pred &= h->max_pic_num - 1;
2890
2891                         frame_num = pic_num_extract(h, pred, &pic_structure);
2892
2893                         for(i= h->short_ref_count-1; i>=0; i--){
2894                             ref = h->short_ref[i];
2895                             assert(ref->reference);
2896                             assert(!ref->long_ref);
2897                             if(
2898                                    ref->frame_num == frame_num &&
2899                                    (ref->reference & pic_structure)
2900                               )
2901                                 break;
2902                         }
2903                         if(i>=0)
2904                             ref->pic_id= pred;
2905                     }else{
2906                         int long_idx;
2907                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2908
2909                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2910
2911                         if(long_idx>31){
2912                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2913                             return -1;
2914                         }
2915                         ref = h->long_ref[long_idx];
2916                         assert(!(ref && !ref->reference));
2917                         if(ref && (ref->reference & pic_structure)){
2918                             ref->pic_id= pic_id;
2919                             assert(ref->long_ref);
2920                             i=0;
2921                         }else{
2922                             i=-1;
2923                         }
2924                     }
2925
2926                     if (i < 0) {
2927                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2928                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2929                     } else {
2930                         for(i=index; i+1<h->ref_count[list]; i++){
2931                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2932                                 break;
2933                         }
2934                         for(; i > index; i--){
2935                             h->ref_list[list][i]= h->ref_list[list][i-1];
2936                         }
2937                         h->ref_list[list][index]= *ref;
2938                         if (FIELD_PICTURE){
2939                             pic_as_field(&h->ref_list[list][index], pic_structure);
2940                         }
2941                     }
2942                 }else{
2943                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2944                     return -1;
2945                 }
2946             }
2947         }
2948     }
2949     for(list=0; list<h->list_count; list++){
2950         for(index= 0; index < h->ref_count[list]; index++){
2951             if(!h->ref_list[list][index].data[0]){
2952                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2953                 if(h->default_ref_list[list][0].data[0])
2954                     h->ref_list[list][index]= h->default_ref_list[list][0];
2955                 else
2956                     return -1;
2957             }
2958         }
2959     }
2960
2961     return 0;
2962 }
2963
2964 static void fill_mbaff_ref_list(H264Context *h){
2965     int list, i, j;
2966     for(list=0; list<2; list++){ //FIXME try list_count
2967         for(i=0; i<h->ref_count[list]; i++){
2968             Picture *frame = &h->ref_list[list][i];
2969             Picture *field = &h->ref_list[list][16+2*i];
2970             field[0] = *frame;
2971             for(j=0; j<3; j++)
2972                 field[0].linesize[j] <<= 1;
2973             field[0].reference = PICT_TOP_FIELD;
2974             field[0].poc= field[0].field_poc[0];
2975             field[1] = field[0];
2976             for(j=0; j<3; j++)
2977                 field[1].data[j] += frame->linesize[j];
2978             field[1].reference = PICT_BOTTOM_FIELD;
2979             field[1].poc= field[1].field_poc[1];
2980
2981             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2982             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2983             for(j=0; j<2; j++){
2984                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2985                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2986             }
2987         }
2988     }
2989     for(j=0; j<h->ref_count[1]; j++){
2990         for(i=0; i<h->ref_count[0]; i++)
2991             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2992         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2993         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2994     }
2995 }
2996
2997 static int pred_weight_table(H264Context *h){
2998     MpegEncContext * const s = &h->s;
2999     int list, i;
3000     int luma_def, chroma_def;
3001
3002     h->use_weight= 0;
3003     h->use_weight_chroma= 0;
3004     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3005     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3006     luma_def = 1<<h->luma_log2_weight_denom;
3007     chroma_def = 1<<h->chroma_log2_weight_denom;
3008
3009     for(list=0; list<2; list++){
3010         h->luma_weight_flag[list]   = 0;
3011         h->chroma_weight_flag[list] = 0;
3012         for(i=0; i<h->ref_count[list]; i++){
3013             int luma_weight_flag, chroma_weight_flag;
3014
3015             luma_weight_flag= get_bits1(&s->gb);
3016             if(luma_weight_flag){
3017                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3018                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3019                 if(   h->luma_weight[list][i] != luma_def
3020                    || h->luma_offset[list][i] != 0) {
3021                     h->use_weight= 1;
3022                     h->luma_weight_flag[list]= 1;
3023                 }
3024             }else{
3025                 h->luma_weight[list][i]= luma_def;
3026                 h->luma_offset[list][i]= 0;
3027             }
3028
3029             if(CHROMA){
3030                 chroma_weight_flag= get_bits1(&s->gb);
3031                 if(chroma_weight_flag){
3032                     int j;
3033                     for(j=0; j<2; j++){
3034                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3035                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3036                         if(   h->chroma_weight[list][i][j] != chroma_def
3037                            || h->chroma_offset[list][i][j] != 0) {
3038                             h->use_weight_chroma= 1;
3039                             h->chroma_weight_flag[list]= 1;
3040                         }
3041                     }
3042                 }else{
3043                     int j;
3044                     for(j=0; j<2; j++){
3045                         h->chroma_weight[list][i][j]= chroma_def;
3046                         h->chroma_offset[list][i][j]= 0;
3047                     }
3048                 }
3049             }
3050         }
3051         if(h->slice_type_nos != FF_B_TYPE) break;
3052     }
3053     h->use_weight= h->use_weight || h->use_weight_chroma;
3054     return 0;
3055 }
3056
3057 static void implicit_weight_table(H264Context *h){
3058     MpegEncContext * const s = &h->s;
3059     int ref0, ref1, i;
3060     int cur_poc = s->current_picture_ptr->poc;
3061
3062     for (i = 0; i < 2; i++) {
3063         h->luma_weight_flag[i]   = 0;
3064         h->chroma_weight_flag[i] = 0;
3065     }
3066
3067     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3068        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3069         h->use_weight= 0;
3070         h->use_weight_chroma= 0;
3071         return;
3072     }
3073
3074     h->use_weight= 2;
3075     h->use_weight_chroma= 2;
3076     h->luma_log2_weight_denom= 5;
3077     h->chroma_log2_weight_denom= 5;
3078
3079     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3080         int poc0 = h->ref_list[0][ref0].poc;
3081         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3082             int poc1 = h->ref_list[1][ref1].poc;
3083             int td = av_clip(poc1 - poc0, -128, 127);
3084             if(td){
3085                 int tb = av_clip(cur_poc - poc0, -128, 127);
3086                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3087                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3088                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3089                     h->implicit_weight[ref0][ref1] = 32;
3090                 else
3091                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3092             }else
3093                 h->implicit_weight[ref0][ref1] = 32;
3094         }
3095     }
3096 }
3097
3098 /**
3099  * Mark a picture as no longer needed for reference. The refmask
3100  * argument allows unreferencing of individual fields or the whole frame.
3101  * If the picture becomes entirely unreferenced, but is being held for
3102  * display purposes, it is marked as such.
3103  * @param refmask mask of fields to unreference; the mask is bitwise
3104  *                anded with the reference marking of pic
3105  * @return non-zero if pic becomes entirely unreferenced (except possibly
3106  *         for display purposes) zero if one of the fields remains in
3107  *         reference
3108  */
3109 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3110     int i;
3111     if (pic->reference &= refmask) {
3112         return 0;
3113     } else {
3114         for(i = 0; h->delayed_pic[i]; i++)
3115             if(pic == h->delayed_pic[i]){
3116                 pic->reference=DELAYED_PIC_REF;
3117                 break;
3118             }
3119         return 1;
3120     }
3121 }
3122
3123 /**
3124  * instantaneous decoder refresh.
3125  */
3126 static void idr(H264Context *h){
3127     int i;
3128
3129     for(i=0; i<16; i++){
3130         remove_long(h, i, 0);
3131     }
3132     assert(h->long_ref_count==0);
3133
3134     for(i=0; i<h->short_ref_count; i++){
3135         unreference_pic(h, h->short_ref[i], 0);
3136         h->short_ref[i]= NULL;
3137     }
3138     h->short_ref_count=0;
3139     h->prev_frame_num= 0;
3140     h->prev_frame_num_offset= 0;
3141     h->prev_poc_msb=
3142     h->prev_poc_lsb= 0;
3143 }
3144
3145 /* forget old pics after a seek */
3146 static void flush_dpb(AVCodecContext *avctx){
3147     H264Context *h= avctx->priv_data;
3148     int i;
3149     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3150         if(h->delayed_pic[i])
3151             h->delayed_pic[i]->reference= 0;
3152         h->delayed_pic[i]= NULL;
3153     }
3154     h->outputed_poc= INT_MIN;
3155     idr(h);
3156     if(h->s.current_picture_ptr)
3157         h->s.current_picture_ptr->reference= 0;
3158     h->s.first_field= 0;
3159     reset_sei(h);
3160     ff_mpeg_flush(avctx);
3161 }
3162
3163 /**
3164  * Find a Picture in the short term reference list by frame number.
3165  * @param frame_num frame number to search for
3166  * @param idx the index into h->short_ref where returned picture is found
3167  *            undefined if no picture found.
3168  * @return pointer to the found picture, or NULL if no pic with the provided
3169  *                 frame number is found
3170  */
3171 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3172     MpegEncContext * const s = &h->s;
3173     int i;
3174
3175     for(i=0; i<h->short_ref_count; i++){
3176         Picture *pic= h->short_ref[i];
3177         if(s->avctx->debug&FF_DEBUG_MMCO)
3178             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3179         if(pic->frame_num == frame_num) {
3180             *idx = i;
3181             return pic;
3182         }
3183     }
3184     return NULL;
3185 }
3186
3187 /**
3188  * Remove a picture from the short term reference list by its index in
3189  * that list.  This does no checking on the provided index; it is assumed
3190  * to be valid. Other list entries are shifted down.
3191  * @param i index into h->short_ref of picture to remove.
3192  */
3193 static void remove_short_at_index(H264Context *h, int i){
3194     assert(i >= 0 && i < h->short_ref_count);
3195     h->short_ref[i]= NULL;
3196     if (--h->short_ref_count)
3197         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3198 }
3199
3200 /**
3201  *
3202  * @return the removed picture or NULL if an error occurs
3203  */
3204 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3205     MpegEncContext * const s = &h->s;
3206     Picture *pic;
3207     int i;
3208
3209     if(s->avctx->debug&FF_DEBUG_MMCO)
3210         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3211
3212     pic = find_short(h, frame_num, &i);
3213     if (pic){
3214         if(unreference_pic(h, pic, ref_mask))
3215         remove_short_at_index(h, i);
3216     }
3217
3218     return pic;
3219 }
3220
3221 /**
3222  * Remove a picture from the long term reference list by its index in
3223  * that list.
3224  * @return the removed picture or NULL if an error occurs
3225  */
3226 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3227     Picture *pic;
3228
3229     pic= h->long_ref[i];
3230     if (pic){
3231         if(unreference_pic(h, pic, ref_mask)){
3232             assert(h->long_ref[i]->long_ref == 1);
3233             h->long_ref[i]->long_ref= 0;
3234             h->long_ref[i]= NULL;
3235             h->long_ref_count--;
3236         }
3237     }
3238
3239     return pic;
3240 }
3241
3242 /**
3243  * print short term list
3244  */
3245 static void print_short_term(H264Context *h) {
3246     uint32_t i;
3247     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3248         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3249         for(i=0; i<h->short_ref_count; i++){
3250             Picture *pic= h->short_ref[i];
3251             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3252         }
3253     }
3254 }
3255
3256 /**
3257  * print long term list
3258  */
3259 static void print_long_term(H264Context *h) {
3260     uint32_t i;
3261     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3262         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3263         for(i = 0; i < 16; i++){
3264             Picture *pic= h->long_ref[i];
3265             if (pic) {
3266                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3267             }
3268         }
3269     }
3270 }
3271
3272 /**
3273  * Executes the reference picture marking (memory management control operations).
3274  */
3275 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3276     MpegEncContext * const s = &h->s;
3277     int i, av_uninit(j);
3278     int current_ref_assigned=0;
3279     Picture *av_uninit(pic);
3280
3281     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3282         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3283
3284     for(i=0; i<mmco_count; i++){
3285         int av_uninit(structure), av_uninit(frame_num);
3286         if(s->avctx->debug&FF_DEBUG_MMCO)
3287             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3288
3289         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3290            || mmco[i].opcode == MMCO_SHORT2LONG){
3291             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3292             pic = find_short(h, frame_num, &j);
3293             if(!pic){
3294                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3295                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3296                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3297                 continue;
3298             }
3299         }
3300
3301         switch(mmco[i].opcode){
3302         case MMCO_SHORT2UNUSED:
3303             if(s->avctx->debug&FF_DEBUG_MMCO)
3304                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3305             remove_short(h, frame_num, structure ^ PICT_FRAME);
3306             break;
3307         case MMCO_SHORT2LONG:
3308                 if (h->long_ref[mmco[i].long_arg] != pic)
3309                     remove_long(h, mmco[i].long_arg, 0);
3310
3311                 remove_short_at_index(h, j);
3312                 h->long_ref[ mmco[i].long_arg ]= pic;
3313                 if (h->long_ref[ mmco[i].long_arg ]){
3314                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3315                     h->long_ref_count++;
3316                 }
3317             break;
3318         case MMCO_LONG2UNUSED:
3319             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3320             pic = h->long_ref[j];
3321             if (pic) {
3322                 remove_long(h, j, structure ^ PICT_FRAME);
3323             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3324                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3325             break;
3326         case MMCO_LONG:
3327                     // Comment below left from previous code as it is an interresting note.
3328                     /* First field in pair is in short term list or
3329                      * at a different long term index.
3330                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3331                      * Report the problem and keep the pair where it is,
3332                      * and mark this field valid.
3333                      */
3334
3335             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3336                 remove_long(h, mmco[i].long_arg, 0);
3337
3338                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3339                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3340                 h->long_ref_count++;
3341             }
3342
3343             s->current_picture_ptr->reference |= s->picture_structure;
3344             current_ref_assigned=1;
3345             break;
3346         case MMCO_SET_MAX_LONG:
3347             assert(mmco[i].long_arg <= 16);
3348             // just remove the long term which index is greater than new max
3349             for(j = mmco[i].long_arg; j<16; j++){
3350                 remove_long(h, j, 0);
3351             }
3352             break;
3353         case MMCO_RESET:
3354             while(h->short_ref_count){
3355                 remove_short(h, h->short_ref[0]->frame_num, 0);
3356             }
3357             for(j = 0; j < 16; j++) {
3358                 remove_long(h, j, 0);
3359             }
3360             s->current_picture_ptr->poc=
3361             s->current_picture_ptr->field_poc[0]=
3362             s->current_picture_ptr->field_poc[1]=
3363             h->poc_lsb=
3364             h->poc_msb=
3365             h->frame_num=
3366             s->current_picture_ptr->frame_num= 0;
3367             break;
3368         default: assert(0);
3369         }
3370     }
3371
3372     if (!current_ref_assigned) {
3373         /* Second field of complementary field pair; the first field of
3374          * which is already referenced. If short referenced, it
3375          * should be first entry in short_ref. If not, it must exist
3376          * in long_ref; trying to put it on the short list here is an
3377          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3378          */
3379         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3380             /* Just mark the second field valid */
3381             s->current_picture_ptr->reference = PICT_FRAME;
3382         } else if (s->current_picture_ptr->long_ref) {
3383             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3384                                              "assignment for second field "
3385                                              "in complementary field pair "
3386                                              "(first field is long term)\n");
3387         } else {
3388             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3389             if(pic){
3390                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3391             }
3392
3393             if(h->short_ref_count)
3394                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3395
3396             h->short_ref[0]= s->current_picture_ptr;
3397             h->short_ref_count++;
3398             s->current_picture_ptr->reference |= s->picture_structure;
3399         }
3400     }
3401
3402     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3403
3404         /* We have too many reference frames, probably due to corrupted
3405          * stream. Need to discard one frame. Prevents overrun of the
3406          * short_ref and long_ref buffers.
3407          */
3408         av_log(h->s.avctx, AV_LOG_ERROR,
3409                "number of reference frames exceeds max (probably "
3410                "corrupt input), discarding one\n");
3411
3412         if (h->long_ref_count && !h->short_ref_count) {
3413             for (i = 0; i < 16; ++i)
3414                 if (h->long_ref[i])
3415                     break;
3416
3417             assert(i < 16);
3418             remove_long(h, i, 0);
3419         } else {
3420             pic = h->short_ref[h->short_ref_count - 1];
3421             remove_short(h, pic->frame_num, 0);
3422         }
3423     }
3424
3425     print_short_term(h);
3426     print_long_term(h);
3427     return 0;
3428 }
3429
3430 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3431     MpegEncContext * const s = &h->s;
3432     int i;
3433
3434     h->mmco_index= 0;
3435     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3436         s->broken_link= get_bits1(gb) -1;
3437         if(get_bits1(gb)){
3438             h->mmco[0].opcode= MMCO_LONG;
3439             h->mmco[0].long_arg= 0;
3440             h->mmco_index= 1;
3441         }
3442     }else{
3443         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3444             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3445                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3446
3447                 h->mmco[i].opcode= opcode;
3448                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3449                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3450 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3451                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3452                         return -1;
3453                     }*/
3454                 }
3455                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3456                     unsigned int long_arg= get_ue_golomb_31(gb);
3457                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3458                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3459                         return -1;
3460                     }
3461                     h->mmco[i].long_arg= long_arg;
3462                 }
3463
3464                 if(opcode > (unsigned)MMCO_LONG){
3465                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3466                     return -1;
3467                 }
3468                 if(opcode == MMCO_END)
3469                     break;
3470             }
3471             h->mmco_index= i;
3472         }else{
3473             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3474
3475             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3476                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3477                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3478                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3479                 h->mmco_index= 1;
3480                 if (FIELD_PICTURE) {
3481                     h->mmco[0].short_pic_num *= 2;
3482                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3483                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3484                     h->mmco_index= 2;
3485                 }
3486             }
3487         }
3488     }
3489
3490     return 0;
3491 }
3492
3493 static int init_poc(H264Context *h){
3494     MpegEncContext * const s = &h->s;
3495     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3496     int field_poc[2];
3497     Picture *cur = s->current_picture_ptr;
3498
3499     h->frame_num_offset= h->prev_frame_num_offset;
3500     if(h->frame_num < h->prev_frame_num)
3501         h->frame_num_offset += max_frame_num;
3502
3503     if(h->sps.poc_type==0){
3504         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3505
3506         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3507             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3508         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3509             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3510         else
3511             h->poc_msb = h->prev_poc_msb;
3512 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3513         field_poc[0] =
3514         field_poc[1] = h->poc_msb + h->poc_lsb;
3515         if(s->picture_structure == PICT_FRAME)
3516             field_poc[1] += h->delta_poc_bottom;
3517     }else if(h->sps.poc_type==1){
3518         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3519         int i;
3520
3521         if(h->sps.poc_cycle_length != 0)
3522             abs_frame_num = h->frame_num_offset + h->frame_num;
3523         else
3524             abs_frame_num = 0;
3525
3526         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3527             abs_frame_num--;
3528
3529         expected_delta_per_poc_cycle = 0;
3530         for(i=0; i < h->sps.poc_cycle_length; i++)
3531             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3532
3533         if(abs_frame_num > 0){
3534             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3535             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3536
3537             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3538             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3539                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3540         } else
3541             expectedpoc = 0;
3542
3543         if(h->nal_ref_idc == 0)
3544             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3545
3546         field_poc[0] = expectedpoc + h->delta_poc[0];
3547         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3548
3549         if(s->picture_structure == PICT_FRAME)
3550             field_poc[1] += h->delta_poc[1];
3551     }else{
3552         int poc= 2*(h->frame_num_offset + h->frame_num);
3553
3554         if(!h->nal_ref_idc)
3555             poc--;
3556
3557         field_poc[0]= poc;
3558         field_poc[1]= poc;
3559     }
3560
3561     if(s->picture_structure != PICT_BOTTOM_FIELD)
3562         s->current_picture_ptr->field_poc[0]= field_poc[0];
3563     if(s->picture_structure != PICT_TOP_FIELD)
3564         s->current_picture_ptr->field_poc[1]= field_poc[1];
3565     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3566
3567     return 0;
3568 }
3569
3570
3571 /**
3572  * initialize scan tables
3573  */
3574 static void init_scan_tables(H264Context *h){
3575     MpegEncContext * const s = &h->s;
3576     int i;
3577     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3578         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3579         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3580     }else{
3581         for(i=0; i<16; i++){
3582 #define T(x) (x>>2) | ((x<<2) & 0xF)
3583             h->zigzag_scan[i] = T(zigzag_scan[i]);
3584             h-> field_scan[i] = T( field_scan[i]);
3585 #undef T
3586         }
3587     }
3588     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3589         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3590         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3591         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3592         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3593     }else{
3594         for(i=0; i<64; i++){
3595 #define T(x) (x>>3) | ((x&7)<<3)
3596             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3597             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3598             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3599             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3600 #undef T
3601         }
3602     }
3603     if(h->sps.transform_bypass){ //FIXME same ugly
3604         h->zigzag_scan_q0          = zigzag_scan;
3605         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3606         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3607         h->field_scan_q0           = field_scan;
3608         h->field_scan8x8_q0        = field_scan8x8;
3609         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3610     }else{
3611         h->zigzag_scan_q0          = h->zigzag_scan;
3612         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3613         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3614         h->field_scan_q0           = h->field_scan;
3615         h->field_scan8x8_q0        = h->field_scan8x8;
3616         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3617     }
3618 }
3619
3620 static void field_end(H264Context *h){
3621     MpegEncContext * const s = &h->s;
3622     AVCodecContext * const avctx= s->avctx;
3623     s->mb_y= 0;
3624
3625     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3626     s->current_picture_ptr->pict_type= s->pict_type;
3627
3628     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3629         ff_vdpau_h264_set_reference_frames(s);
3630
3631     if(!s->dropable) {
3632         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3633         h->prev_poc_msb= h->poc_msb;
3634         h->prev_poc_lsb= h->poc_lsb;
3635     }
3636     h->prev_frame_num_offset= h->frame_num_offset;
3637     h->prev_frame_num= h->frame_num;
3638
3639     if (avctx->hwaccel) {
3640         if (avctx->hwaccel->end_frame(avctx) < 0)
3641             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3642     }
3643
3644     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3645         ff_vdpau_h264_picture_complete(s);
3646
3647     /*
3648      * FIXME: Error handling code does not seem to support interlaced
3649      * when slices span multiple rows
3650      * The ff_er_add_slice calls don't work right for bottom
3651      * fields; they cause massive erroneous error concealing
3652      * Error marking covers both fields (top and bottom).
3653      * This causes a mismatched s->error_count
3654      * and a bad error table. Further, the error count goes to
3655      * INT_MAX when called for bottom field, because mb_y is
3656      * past end by one (callers fault) and resync_mb_y != 0
3657      * causes problems for the first MB line, too.
3658      */
3659     if (!FIELD_PICTURE)
3660         ff_er_frame_end(s);
3661
3662     MPV_frame_end(s);
3663
3664     h->current_slice=0;
3665 }
3666
3667 /**
3668  * Replicates H264 "master" context to thread contexts.
3669  */
3670 static void clone_slice(H264Context *dst, H264Context *src)
3671 {
3672     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3673     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3674     dst->s.current_picture      = src->s.current_picture;
3675     dst->s.linesize             = src->s.linesize;
3676     dst->s.uvlinesize           = src->s.uvlinesize;
3677     dst->s.first_field          = src->s.first_field;
3678
3679     dst->prev_poc_msb           = src->prev_poc_msb;
3680     dst->prev_poc_lsb           = src->prev_poc_lsb;
3681     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3682     dst->prev_frame_num         = src->prev_frame_num;
3683     dst->short_ref_count        = src->short_ref_count;
3684
3685     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3686     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3687     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3688     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3689
3690     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3691     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3692 }
3693
3694 /**
3695  * decodes a slice header.
3696  * This will also call MPV_common_init() and frame_start() as needed.
3697  *
3698  * @param h h264context
3699  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3700  *
3701  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3702  */
3703 static int decode_slice_header(H264Context *h, H264Context *h0){
3704     MpegEncContext * const s = &h->s;
3705     MpegEncContext * const s0 = &h0->s;
3706     unsigned int first_mb_in_slice;
3707     unsigned int pps_id;
3708     int num_ref_idx_active_override_flag;
3709     unsigned int slice_type, tmp, i, j;
3710     int default_ref_list_done = 0;
3711     int last_pic_structure;
3712
3713     s->dropable= h->nal_ref_idc == 0;
3714
3715     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3716         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3717         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3718     }else{
3719         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3720         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3721     }
3722
3723     first_mb_in_slice= get_ue_golomb(&s->gb);
3724
3725     if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3726         if(h0->current_slice && FIELD_PICTURE){
3727             field_end(h);
3728         }
3729
3730         h0->current_slice = 0;
3731         if (!s0->first_field)
3732             s->current_picture_ptr= NULL;
3733     }
3734
3735     slice_type= get_ue_golomb_31(&s->gb);
3736     if(slice_type > 9){
3737         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3738         return -1;
3739     }
3740     if(slice_type > 4){
3741         slice_type -= 5;
3742         h->slice_type_fixed=1;
3743     }else
3744         h->slice_type_fixed=0;
3745
3746     slice_type= golomb_to_pict_type[ slice_type ];
3747     if (slice_type == FF_I_TYPE
3748         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3749         default_ref_list_done = 1;
3750     }
3751     h->slice_type= slice_type;
3752     h->slice_type_nos= slice_type & 3;
3753
3754     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3755     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3756         av_log(h->s.avctx, AV_LOG_ERROR,
3757                "B picture before any references, skipping\n");
3758         return -1;
3759     }
3760
3761     pps_id= get_ue_golomb(&s->gb);
3762     if(pps_id>=MAX_PPS_COUNT){
3763         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3764         return -1;
3765     }
3766     if(!h0->pps_buffers[pps_id]) {
3767         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3768         return -1;
3769     }
3770     h->pps= *h0->pps_buffers[pps_id];
3771
3772     if(!h0->sps_buffers[h->pps.sps_id]) {
3773         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3774         return -1;
3775     }
3776     h->sps = *h0->sps_buffers[h->pps.sps_id];
3777
3778     if(h == h0 && h->dequant_coeff_pps != pps_id){
3779         h->dequant_coeff_pps = pps_id;
3780         init_dequant_tables(h);
3781     }
3782
3783     s->mb_width= h->sps.mb_width;
3784     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3785
3786     h->b_stride=  s->mb_width*4;
3787     h->b8_stride= s->mb_width*2;
3788
3789     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3790     if(h->sps.frame_mbs_only_flag)
3791         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3792     else
3793         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3794
3795     if (s->context_initialized
3796         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3797         if(h != h0)
3798             return -1;   // width / height changed during parallelized decoding
3799         free_tables(h);
3800         flush_dpb(s->avctx);
3801         MPV_common_end(s);
3802     }
3803     if (!s->context_initialized) {
3804         if(h != h0)
3805             return -1;  // we cant (re-)initialize context during parallel decoding
3806         if (MPV_common_init(s) < 0)
3807             return -1;
3808         s->first_field = 0;
3809
3810         init_scan_tables(h);
3811         alloc_tables(h);
3812
3813         for(i = 1; i < s->avctx->thread_count; i++) {
3814             H264Context *c;
3815             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3816             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3817             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3818             c->sps = h->sps;
3819             c->pps = h->pps;
3820             init_scan_tables(c);
3821             clone_tables(c, h);
3822         }
3823
3824         for(i = 0; i < s->avctx->thread_count; i++)
3825             if(context_init(h->thread_context[i]) < 0)
3826                 return -1;
3827
3828         s->avctx->width = s->width;
3829         s->avctx->height = s->height;
3830         s->avctx->sample_aspect_ratio= h->sps.sar;
3831         if(!s->avctx->sample_aspect_ratio.den)
3832             s->avctx->sample_aspect_ratio.den = 1;
3833
3834         if(h->sps.timing_info_present_flag){
3835             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3836             if(h->x264_build > 0 && h->x264_build < 44)
3837                 s->avctx->time_base.den *= 2;
3838             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3839                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3840         }
3841     }
3842
3843     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3844
3845     h->mb_mbaff = 0;
3846     h->mb_aff_frame = 0;
3847     last_pic_structure = s0->picture_structure;
3848     if(h->sps.frame_mbs_only_flag){
3849         s->picture_structure= PICT_FRAME;
3850     }else{
3851         if(get_bits1(&s->gb)) { //field_pic_flag
3852             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3853         } else {
3854             s->picture_structure= PICT_FRAME;
3855             h->mb_aff_frame = h->sps.mb_aff;
3856         }
3857     }
3858     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3859
3860     if(h0->current_slice == 0){
3861         while(h->frame_num !=  h->prev_frame_num &&
3862               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3863             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3864             if (frame_start(h) < 0)
3865                 return -1;
3866             h->prev_frame_num++;
3867             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3868             s->current_picture_ptr->frame_num= h->prev_frame_num;
3869             execute_ref_pic_marking(h, NULL, 0);
3870         }
3871
3872         /* See if we have a decoded first field looking for a pair... */
3873         if (s0->first_field) {
3874             assert(s0->current_picture_ptr);
3875             assert(s0->current_picture_ptr->data[0]);
3876             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3877
3878             /* figure out if we have a complementary field pair */
3879             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3880                 /*
3881                  * Previous field is unmatched. Don't display it, but let it
3882                  * remain for reference if marked as such.
3883                  */
3884                 s0->current_picture_ptr = NULL;
3885                 s0->first_field = FIELD_PICTURE;
3886
3887             } else {
3888                 if (h->nal_ref_idc &&
3889                         s0->current_picture_ptr->reference &&
3890                         s0->current_picture_ptr->frame_num != h->frame_num) {
3891                     /*
3892                      * This and previous field were reference, but had
3893                      * different frame_nums. Consider this field first in
3894                      * pair. Throw away previous field except for reference
3895                      * purposes.
3896                      */
3897                     s0->first_field = 1;
3898                     s0->current_picture_ptr = NULL;
3899
3900                 } else {
3901                     /* Second field in complementary pair */
3902                     s0->first_field = 0;
3903                 }
3904             }
3905
3906         } else {
3907             /* Frame or first field in a potentially complementary pair */
3908             assert(!s0->current_picture_ptr);
3909             s0->first_field = FIELD_PICTURE;
3910         }
3911
3912         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3913             s0->first_field = 0;
3914             return -1;
3915         }
3916     }
3917     if(h != h0)
3918         clone_slice(h, h0);
3919
3920     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3921
3922     assert(s->mb_num == s->mb_width * s->mb_height);
3923     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3924        first_mb_in_slice                    >= s->mb_num){
3925         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3926         return -1;
3927     }
3928     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3929     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3930     if (s->picture_structure == PICT_BOTTOM_FIELD)
3931         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3932     assert(s->mb_y < s->mb_height);
3933
3934     if(s->picture_structure==PICT_FRAME){
3935         h->curr_pic_num=   h->frame_num;
3936         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3937     }else{
3938         h->curr_pic_num= 2*h->frame_num + 1;
3939         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3940     }
3941
3942     if(h->nal_unit_type == NAL_IDR_SLICE){
3943         get_ue_golomb(&s->gb); /* idr_pic_id */
3944     }
3945
3946     if(h->sps.poc_type==0){
3947         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3948
3949         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3950             h->delta_poc_bottom= get_se_golomb(&s->gb);
3951         }
3952     }
3953
3954     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3955         h->delta_poc[0]= get_se_golomb(&s->gb);
3956
3957         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3958             h->delta_poc[1]= get_se_golomb(&s->gb);
3959     }
3960
3961     init_poc(h);
3962
3963     if(h->pps.redundant_pic_cnt_present){
3964         h->redundant_pic_count= get_ue_golomb(&s->gb);
3965     }
3966
3967     //set defaults, might be overridden a few lines later
3968     h->ref_count[0]= h->pps.ref_count[0];
3969     h->ref_count[1]= h->pps.ref_count[1];
3970
3971     if(h->slice_type_nos != FF_I_TYPE){
3972         if(h->slice_type_nos == FF_B_TYPE){
3973             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3974         }
3975         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3976
3977         if(num_ref_idx_active_override_flag){
3978             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3979             if(h->slice_type_nos==FF_B_TYPE)
3980                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3981
3982             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3983                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3984                 h->ref_count[0]= h->ref_count[1]= 1;
3985                 return -1;
3986             }
3987         }
3988         if(h->slice_type_nos == FF_B_TYPE)
3989             h->list_count= 2;
3990         else
3991             h->list_count= 1;
3992     }else
3993         h->list_count= 0;
3994
3995     if(!default_ref_list_done){
3996         fill_default_ref_list(h);
3997     }
3998
3999     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4000         return -1;
4001
4002     if(h->slice_type_nos!=FF_I_TYPE){
4003         s->last_picture_ptr= &h->ref_list[0][0];
4004         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4005     }
4006     if(h->slice_type_nos==FF_B_TYPE){
4007         s->next_picture_ptr= &h->ref_list[1][0];
4008         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4009     }
4010
4011     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4012        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4013         pred_weight_table(h);
4014     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4015         implicit_weight_table(h);
4016     else {
4017         h->use_weight = 0;
4018         for (i = 0; i < 2; i++) {
4019             h->luma_weight_flag[i]   = 0;
4020             h->chroma_weight_flag[i] = 0;
4021         }
4022     }
4023
4024     if(h->nal_ref_idc)
4025         decode_ref_pic_marking(h0, &s->gb);
4026
4027     if(FRAME_MBAFF)
4028         fill_mbaff_ref_list(h);
4029
4030     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4031         direct_dist_scale_factor(h);
4032     direct_ref_list_init(h);
4033
4034     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4035         tmp = get_ue_golomb_31(&s->gb);
4036         if(tmp > 2){
4037             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4038             return -1;
4039         }
4040         h->cabac_init_idc= tmp;
4041     }
4042
4043     h->last_qscale_diff = 0;
4044     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4045     if(tmp>51){
4046         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4047         return -1;
4048     }
4049     s->qscale= tmp;
4050     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4051     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4052     //FIXME qscale / qp ... stuff
4053     if(h->slice_type == FF_SP_TYPE){
4054         get_bits1(&s->gb); /* sp_for_switch_flag */
4055     }
4056     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4057         get_se_golomb(&s->gb); /* slice_qs_delta */
4058     }
4059
4060     h->deblocking_filter = 1;
4061     h->slice_alpha_c0_offset = 0;
4062     h->slice_beta_offset = 0;
4063     if( h->pps.deblocking_filter_parameters_present ) {
4064         tmp= get_ue_golomb_31(&s->gb);
4065         if(tmp > 2){
4066             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4067             return -1;
4068         }
4069         h->deblocking_filter= tmp;
4070         if(h->deblocking_filter < 2)
4071             h->deblocking_filter^= 1; // 1<->0
4072
4073         if( h->deblocking_filter ) {
4074             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4075             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4076         }
4077     }
4078
4079     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4080        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4081        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4082        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4083         h->deblocking_filter= 0;
4084
4085     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4086         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4087             /* Cheat slightly for speed:
4088                Do not bother to deblock across slices. */
4089             h->deblocking_filter = 2;
4090         } else {
4091             h0->max_contexts = 1;
4092             if(!h0->single_decode_warning) {
4093                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4094                 h0->single_decode_warning = 1;
4095             }
4096             if(h != h0)
4097                 return 1; // deblocking switched inside frame
4098         }
4099     }
4100
4101 #if 0 //FMO
4102     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4103         slice_group_change_cycle= get_bits(&s->gb, ?);
4104 #endif
4105
4106     h0->last_slice_type = slice_type;
4107     h->slice_num = ++h0->current_slice;
4108     if(h->slice_num >= MAX_SLICES){
4109         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4110     }
4111
4112     for(j=0; j<2; j++){
4113         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4114         ref2frm[0]=
4115         ref2frm[1]= -1;
4116         for(i=0; i<16; i++)
4117             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4118                           +(h->ref_list[j][i].reference&3);
4119         ref2frm[18+0]=
4120         ref2frm[18+1]= -1;
4121         for(i=16; i<48; i++)
4122             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4123                           +(h->ref_list[j][i].reference&3);
4124     }
4125
4126     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4127     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4128
4129     s->avctx->refs= h->sps.ref_frame_count;
4130
4131     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4132         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4133                h->slice_num,
4134                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4135                first_mb_in_slice,
4136                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4137                pps_id, h->frame_num,
4138                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4139                h->ref_count[0], h->ref_count[1],
4140                s->qscale,
4141                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4142                h->use_weight,
4143                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4144                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4145                );
4146     }
4147
4148     return 0;
4149 }
4150
4151 /**
4152  *
4153  */
4154 static inline int get_level_prefix(GetBitContext *gb){
4155     unsigned int buf;
4156     int log;
4157
4158     OPEN_READER(re, gb);
4159     UPDATE_CACHE(re, gb);
4160     buf=GET_CACHE(re, gb);
4161
4162     log= 32 - av_log2(buf);
4163 #ifdef TRACE
4164     print_bin(buf>>(32-log), log);
4165     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4166 #endif
4167
4168     LAST_SKIP_BITS(re, gb, log);
4169     CLOSE_READER(re, gb);
4170
4171     return log-1;
4172 }
4173
4174 static inline int get_dct8x8_allowed(H264Context *h){
4175     if(h->sps.direct_8x8_inference_flag)
4176         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4177     else
4178         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4179 }
4180
4181 /**
4182  * decodes a residual block.
4183  * @param n block index
4184  * @param scantable scantable
4185  * @param max_coeff number of coefficients in the block
4186  * @return <0 if an error occurred
4187  */
4188 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4189     MpegEncContext * const s = &h->s;
4190     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4191     int level[16];
4192     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4193
4194     //FIXME put trailing_onex into the context
4195
4196     if(n == CHROMA_DC_BLOCK_INDEX){
4197         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4198         total_coeff= coeff_token>>2;
4199     }else{
4200         if(n == LUMA_DC_BLOCK_INDEX){
4201             total_coeff= pred_non_zero_count(h, 0);
4202             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4203             total_coeff= coeff_token>>2;
4204         }else{
4205             total_coeff= pred_non_zero_count(h, n);
4206             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4207             total_coeff= coeff_token>>2;
4208             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4209         }
4210     }
4211
4212     //FIXME set last_non_zero?
4213
4214     if(total_coeff==0)
4215         return 0;
4216     if(total_coeff > (unsigned)max_coeff) {
4217         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4218         return -1;
4219     }
4220
4221     trailing_ones= coeff_token&3;
4222     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4223     assert(total_coeff<=16);
4224
4225     i = show_bits(gb, 3);
4226     skip_bits(gb, trailing_ones);
4227     level[0] = 1-((i&4)>>1);
4228     level[1] = 1-((i&2)   );
4229     level[2] = 1-((i&1)<<1);
4230
4231     if(trailing_ones<total_coeff) {
4232         int mask, prefix;
4233         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4234         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4235         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4236
4237         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4238         if(level_code >= 100){
4239             prefix= level_code - 100;
4240             if(prefix == LEVEL_TAB_BITS)
4241                 prefix += get_level_prefix(gb);
4242
4243             //first coefficient has suffix_length equal to 0 or 1
4244             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4245                 if(suffix_length)
4246                     level_code= (prefix<<1) + get_bits1(gb); //part
4247                 else
4248                     level_code= prefix; //part
4249             }else if(prefix==14){
4250                 if(suffix_length)
4251                     level_code= (prefix<<1) + get_bits1(gb); //part
4252                 else
4253                     level_code= prefix + get_bits(gb, 4); //part
4254             }else{
4255                 level_code= 30 + get_bits(gb, prefix-3); //part
4256                 if(prefix>=16)
4257                     level_code += (1<<(prefix-3))-4096;
4258             }
4259
4260             if(trailing_ones < 3) level_code += 2;
4261
4262             suffix_length = 2;
4263             mask= -(level_code&1);
4264             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4265         }else{
4266             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4267
4268             suffix_length = 1;
4269             if(level_code + 3U > 6U)
4270                 suffix_length++;
4271             level[trailing_ones]= level_code;
4272         }
4273
4274         //remaining coefficients have suffix_length > 0
4275         for(i=trailing_ones+1;i<total_coeff;i++) {
4276             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4277             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4278             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4279
4280             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4281             if(level_code >= 100){
4282                 prefix= level_code - 100;
4283                 if(prefix == LEVEL_TAB_BITS){
4284                     prefix += get_level_prefix(gb);
4285                 }
4286                 if(prefix<15){
4287                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4288                 }else{
4289                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4290                     if(prefix>=16)
4291                         level_code += (1<<(prefix-3))-4096;
4292                 }
4293                 mask= -(level_code&1);
4294                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4295             }
4296             level[i]= level_code;
4297
4298             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4299                 suffix_length++;
4300         }
4301     }
4302
4303     if(total_coeff == max_coeff)
4304         zeros_left=0;
4305     else{
4306         if(n == CHROMA_DC_BLOCK_INDEX)
4307             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4308         else
4309             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4310     }
4311
4312     coeff_num = zeros_left + total_coeff - 1;
4313     j = scantable[coeff_num];
4314     if(n > 24){
4315         block[j] = level[0];
4316         for(i=1;i<total_coeff;i++) {
4317             if(zeros_left <= 0)
4318                 run_before = 0;
4319             else if(zeros_left < 7){
4320                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4321             }else{
4322                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4323             }
4324             zeros_left -= run_before;
4325             coeff_num -= 1 + run_before;
4326             j= scantable[ coeff_num ];
4327
4328             block[j]= level[i];
4329         }
4330     }else{
4331         block[j] = (level[0] * qmul[j] + 32)>>6;
4332         for(i=1;i<total_coeff;i++) {
4333             if(zeros_left <= 0)
4334                 run_before = 0;
4335             else if(zeros_left < 7){
4336                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4337             }else{
4338                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4339             }
4340             zeros_left -= run_before;
4341             coeff_num -= 1 + run_before;
4342             j= scantable[ coeff_num ];
4343
4344             block[j]= (level[i] * qmul[j] + 32)>>6;
4345         }
4346     }
4347
4348     if(zeros_left<0){
4349         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4350         return -1;
4351     }
4352
4353     return 0;
4354 }
4355
4356 static void predict_field_decoding_flag(H264Context *h){
4357     MpegEncContext * const s = &h->s;
4358     const int mb_xy= h->mb_xy;
4359     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4360                 ? s->current_picture.mb_type[mb_xy-1]
4361                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4362                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4363                 : 0;
4364     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4365 }
4366
4367 /**
4368  * decodes a P_SKIP or B_SKIP macroblock
4369  */
4370 static void decode_mb_skip(H264Context *h){
4371     MpegEncContext * const s = &h->s;
4372     const int mb_xy= h->mb_xy;
4373     int mb_type=0;
4374
4375     memset(h->non_zero_count[mb_xy], 0, 16);
4376     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4377
4378     if(MB_FIELD)
4379         mb_type|= MB_TYPE_INTERLACED;
4380
4381     if( h->slice_type_nos == FF_B_TYPE )
4382     {
4383         // just for fill_caches. pred_direct_motion will set the real mb_type
4384         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4385
4386         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4387         pred_direct_motion(h, &mb_type);
4388         mb_type|= MB_TYPE_SKIP;
4389     }
4390     else
4391     {
4392         int mx, my;
4393         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4394
4395         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4396         pred_pskip_motion(h, &mx, &my);
4397         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4398         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4399     }
4400
4401     write_back_motion(h, mb_type);
4402     s->current_picture.mb_type[mb_xy]= mb_type;
4403     s->current_picture.qscale_table[mb_xy]= s->qscale;
4404     h->slice_table[ mb_xy ]= h->slice_num;
4405     h->prev_mb_skipped= 1;
4406 }
4407
4408 /**
4409  * decodes a macroblock
4410  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4411  */
4412 static int decode_mb_cavlc(H264Context *h){
4413     MpegEncContext * const s = &h->s;
4414     int mb_xy;
4415     int partition_count;
4416     unsigned int mb_type, cbp;
4417     int dct8x8_allowed= h->pps.transform_8x8_mode;
4418
4419     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4420
4421     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4422     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4423                 down the code */
4424     if(h->slice_type_nos != FF_I_TYPE){
4425         if(s->mb_skip_run==-1)
4426             s->mb_skip_run= get_ue_golomb(&s->gb);
4427
4428         if (s->mb_skip_run--) {
4429             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4430                 if(s->mb_skip_run==0)
4431                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4432                 else
4433                     predict_field_decoding_flag(h);
4434             }
4435             decode_mb_skip(h);
4436             return 0;
4437         }
4438     }
4439     if(FRAME_MBAFF){
4440         if( (s->mb_y&1) == 0 )
4441             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4442     }
4443
4444     h->prev_mb_skipped= 0;
4445
4446     mb_type= get_ue_golomb(&s->gb);
4447     if(h->slice_type_nos == FF_B_TYPE){
4448         if(mb_type < 23){
4449             partition_count= b_mb_type_info[mb_type].partition_count;
4450             mb_type=         b_mb_type_info[mb_type].type;
4451         }else{
4452             mb_type -= 23;
4453             goto decode_intra_mb;
4454         }
4455     }else if(h->slice_type_nos == FF_P_TYPE){
4456         if(mb_type < 5){
4457             partition_count= p_mb_type_info[mb_type].partition_count;
4458             mb_type=         p_mb_type_info[mb_type].type;
4459         }else{
4460             mb_type -= 5;
4461             goto decode_intra_mb;
4462         }
4463     }else{
4464        assert(h->slice_type_nos == FF_I_TYPE);
4465         if(h->slice_type == FF_SI_TYPE && mb_type)
4466             mb_type--;
4467 decode_intra_mb:
4468         if(mb_type > 25){
4469             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4470             return -1;
4471         }
4472         partition_count=0;
4473         cbp= i_mb_type_info[mb_type].cbp;
4474         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4475         mb_type= i_mb_type_info[mb_type].type;
4476     }
4477
4478     if(MB_FIELD)
4479         mb_type |= MB_TYPE_INTERLACED;
4480
4481     h->slice_table[ mb_xy ]= h->slice_num;
4482
4483     if(IS_INTRA_PCM(mb_type)){
4484         unsigned int x;
4485
4486         // We assume these blocks are very rare so we do not optimize it.
4487         align_get_bits(&s->gb);
4488
4489         // The pixels are stored in the same order as levels in h->mb array.
4490         for(x=0; x < (CHROMA ? 384 : 256); x++){
4491             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4492         }
4493
4494         // In deblocking, the quantizer is 0
4495         s->current_picture.qscale_table[mb_xy]= 0;
4496         // All coeffs are present
4497         memset(h->non_zero_count[mb_xy], 16, 16);
4498
4499         s->current_picture.mb_type[mb_xy]= mb_type;
4500         return 0;
4501     }
4502
4503     if(MB_MBAFF){
4504         h->ref_count[0] <<= 1;
4505         h->ref_count[1] <<= 1;
4506     }
4507
4508     fill_caches(h, mb_type, 0);
4509
4510     //mb_pred
4511     if(IS_INTRA(mb_type)){
4512         int pred_mode;
4513 //            init_top_left_availability(h);
4514         if(IS_INTRA4x4(mb_type)){
4515             int i;
4516             int di = 1;
4517             if(dct8x8_allowed && get_bits1(&s->gb)){
4518                 mb_type |= MB_TYPE_8x8DCT;
4519                 di = 4;
4520             }
4521
4522 //                fill_intra4x4_pred_table(h);
4523             for(i=0; i<16; i+=di){
4524                 int mode= pred_intra_mode(h, i);
4525
4526                 if(!get_bits1(&s->gb)){
4527                     const int rem_mode= get_bits(&s->gb, 3);
4528                     mode = rem_mode + (rem_mode >= mode);
4529                 }
4530
4531                 if(di==4)
4532                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4533                 else
4534                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4535             }
4536             write_back_intra_pred_mode(h);
4537             if( check_intra4x4_pred_mode(h) < 0)
4538                 return -1;
4539         }else{
4540             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4541             if(h->intra16x16_pred_mode < 0)
4542                 return -1;
4543         }
4544         if(CHROMA){
4545             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4546             if(pred_mode < 0)
4547                 return -1;
4548             h->chroma_pred_mode= pred_mode;
4549         }
4550     }else if(partition_count==4){
4551         int i, j, sub_partition_count[4], list, ref[2][4];
4552
4553         if(h->slice_type_nos == FF_B_TYPE){
4554             for(i=0; i<4; i++){
4555                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4556                 if(h->sub_mb_type[i] >=13){
4557                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4558                     return -1;
4559                 }
4560                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4561                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4562             }
4563             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4564                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4565                 pred_direct_motion(h, &mb_type);
4566                 h->ref_cache[0][scan8[4]] =
4567                 h->ref_cache[1][scan8[4]] =
4568                 h->ref_cache[0][scan8[12]] =
4569                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4570             }
4571         }else{
4572             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4573             for(i=0; i<4; i++){
4574                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4575                 if(h->sub_mb_type[i] >=4){
4576                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4577                     return -1;
4578                 }
4579                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4580                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4581             }
4582         }
4583
4584         for(list=0; list<h->list_count; list++){
4585             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4586             for(i=0; i<4; i++){
4587                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4588                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4589                     unsigned int tmp;
4590                     if(ref_count == 1){
4591                         tmp= 0;
4592                     }else if(ref_count == 2){
4593                         tmp= get_bits1(&s->gb)^1;
4594                     }else{
4595                         tmp= get_ue_golomb_31(&s->gb);
4596                         if(tmp>=ref_count){
4597                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4598                             return -1;
4599                         }
4600                     }
4601                     ref[list][i]= tmp;
4602                 }else{
4603                  //FIXME
4604                     ref[list][i] = -1;
4605                 }
4606             }
4607         }
4608
4609         if(dct8x8_allowed)
4610             dct8x8_allowed = get_dct8x8_allowed(h);
4611
4612         for(list=0; list<h->list_count; list++){
4613             for(i=0; i<4; i++){
4614                 if(IS_DIRECT(h->sub_mb_type[i])) {
4615                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4616                     continue;
4617                 }
4618                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4619                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4620
4621                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4622                     const int sub_mb_type= h->sub_mb_type[i];
4623                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4624                     for(j=0; j<sub_partition_count[i]; j++){
4625                         int mx, my;
4626                         const int index= 4*i + block_width*j;
4627                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4628                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4629                         mx += get_se_golomb(&s->gb);
4630                         my += get_se_golomb(&s->gb);
4631                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4632
4633                         if(IS_SUB_8X8(sub_mb_type)){
4634                             mv_cache[ 1 ][0]=
4635                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4636                             mv_cache[ 1 ][1]=
4637                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4638                         }else if(IS_SUB_8X4(sub_mb_type)){
4639                             mv_cache[ 1 ][0]= mx;
4640                             mv_cache[ 1 ][1]= my;
4641                         }else if(IS_SUB_4X8(sub_mb_type)){
4642                             mv_cache[ 8 ][0]= mx;
4643                             mv_cache[ 8 ][1]= my;
4644                         }
4645                         mv_cache[ 0 ][0]= mx;
4646                         mv_cache[ 0 ][1]= my;
4647                     }
4648                 }else{
4649                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4650                     p[0] = p[1]=
4651                     p[8] = p[9]= 0;
4652                 }
4653             }
4654         }
4655     }else if(IS_DIRECT(mb_type)){
4656         pred_direct_motion(h, &mb_type);
4657         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4658     }else{
4659         int list, mx, my, i;
4660          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4661         if(IS_16X16(mb_type)){
4662             for(list=0; list<h->list_count; list++){
4663                     unsigned int val;
4664                     if(IS_DIR(mb_type, 0, list)){
4665                         if(h->ref_count[list]==1){
4666                             val= 0;
4667                         }else if(h->ref_count[list]==2){
4668                             val= get_bits1(&s->gb)^1;
4669                         }else{
4670                             val= get_ue_golomb_31(&s->gb);
4671                             if(val >= h->ref_count[list]){
4672                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4673                                 return -1;
4674                             }
4675                         }
4676                     }else
4677                         val= LIST_NOT_USED&0xFF;
4678                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4679             }
4680             for(list=0; list<h->list_count; list++){
4681                 unsigned int val;
4682                 if(IS_DIR(mb_type, 0, list)){
4683                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4684                     mx += get_se_golomb(&s->gb);
4685                     my += get_se_golomb(&s->gb);
4686                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4687
4688                     val= pack16to32(mx,my);
4689                 }else
4690                     val=0;
4691                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4692             }
4693         }
4694         else if(IS_16X8(mb_type)){
4695             for(list=0; list<h->list_count; list++){
4696                     for(i=0; i<2; i++){
4697                         unsigned int val;
4698                         if(IS_DIR(mb_type, i, list)){
4699                             if(h->ref_count[list] == 1){
4700                                 val= 0;
4701                             }else if(h->ref_count[list] == 2){
4702                                 val= get_bits1(&s->gb)^1;
4703                             }else{
4704                                 val= get_ue_golomb_31(&s->gb);
4705                                 if(val >= h->ref_count[list]){
4706                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4707                                     return -1;
4708                                 }
4709                             }
4710                         }else
4711                             val= LIST_NOT_USED&0xFF;
4712                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4713                     }
4714             }
4715             for(list=0; list<h->list_count; list++){
4716                 for(i=0; i<2; i++){
4717                     unsigned int val;
4718                     if(IS_DIR(mb_type, i, list)){
4719                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4720                         mx += get_se_golomb(&s->gb);
4721                         my += get_se_golomb(&s->gb);
4722                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4723
4724                         val= pack16to32(mx,my);
4725                     }else
4726                         val=0;
4727                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4728                 }
4729             }
4730         }else{
4731             assert(IS_8X16(mb_type));
4732             for(list=0; list<h->list_count; list++){
4733                     for(i=0; i<2; i++){
4734                         unsigned int val;
4735                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4736                             if(h->ref_count[list]==1){
4737                                 val= 0;
4738                             }else if(h->ref_count[list]==2){
4739                                 val= get_bits1(&s->gb)^1;
4740                             }else{
4741                                 val= get_ue_golomb_31(&s->gb);
4742                                 if(val >= h->ref_count[list]){
4743                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4744                                     return -1;
4745                                 }
4746                             }
4747                         }else
4748                             val= LIST_NOT_USED&0xFF;
4749                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4750                     }
4751             }
4752             for(list=0; list<h->list_count; list++){
4753                 for(i=0; i<2; i++){
4754                     unsigned int val;
4755                     if(IS_DIR(mb_type, i, list)){
4756                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4757                         mx += get_se_golomb(&s->gb);
4758                         my += get_se_golomb(&s->gb);
4759                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4760
4761                         val= pack16to32(mx,my);
4762                     }else
4763                         val=0;
4764                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4765                 }
4766             }
4767         }
4768     }
4769
4770     if(IS_INTER(mb_type))
4771         write_back_motion(h, mb_type);
4772
4773     if(!IS_INTRA16x16(mb_type)){
4774         cbp= get_ue_golomb(&s->gb);
4775         if(cbp > 47){
4776             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4777             return -1;
4778         }
4779
4780         if(CHROMA){
4781             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4782             else                     cbp= golomb_to_inter_cbp   [cbp];
4783         }else{
4784             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4785             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4786         }
4787     }
4788     h->cbp = cbp;
4789
4790     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4791         if(get_bits1(&s->gb)){
4792             mb_type |= MB_TYPE_8x8DCT;
4793             h->cbp_table[mb_xy]= cbp;
4794         }
4795     }
4796     s->current_picture.mb_type[mb_xy]= mb_type;
4797
4798     if(cbp || IS_INTRA16x16(mb_type)){
4799         int i8x8, i4x4, chroma_idx;
4800         int dquant;
4801         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4802         const uint8_t *scan, *scan8x8, *dc_scan;
4803
4804 //        fill_non_zero_count_cache(h);
4805
4806         if(IS_INTERLACED(mb_type)){
4807             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4808             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4809             dc_scan= luma_dc_field_scan;
4810         }else{
4811             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4812             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4813             dc_scan= luma_dc_zigzag_scan;
4814         }
4815
4816         dquant= get_se_golomb(&s->gb);
4817
4818         if( dquant > 25 || dquant < -26 ){
4819             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4820             return -1;
4821         }
4822
4823         s->qscale += dquant;
4824         if(((unsigned)s->qscale) > 51){
4825             if(s->qscale<0) s->qscale+= 52;
4826             else            s->qscale-= 52;
4827         }
4828
4829         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4830         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4831         if(IS_INTRA16x16(mb_type)){
4832             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4833                 return -1; //FIXME continue if partitioned and other return -1 too
4834             }
4835
4836             assert((cbp&15) == 0 || (cbp&15) == 15);
4837
4838             if(cbp&15){
4839                 for(i8x8=0; i8x8<4; i8x8++){
4840                     for(i4x4=0; i4x4<4; i4x4++){
4841                         const int index= i4x4 + 4*i8x8;
4842                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4843                             return -1;
4844                         }
4845                     }
4846                 }
4847             }else{
4848                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4849             }
4850         }else{
4851             for(i8x8=0; i8x8<4; i8x8++){
4852                 if(cbp & (1<<i8x8)){
4853                     if(IS_8x8DCT(mb_type)){
4854                         DCTELEM *buf = &h->mb[64*i8x8];
4855                         uint8_t *nnz;
4856                         for(i4x4=0; i4x4<4; i4x4++){
4857                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4858                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4859                                 return -1;
4860                         }
4861                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4862                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4863                     }else{
4864                         for(i4x4=0; i4x4<4; i4x4++){
4865                             const int index= i4x4 + 4*i8x8;
4866
4867                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4868                                 return -1;
4869                             }
4870                         }
4871                     }
4872                 }else{
4873                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4874                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4875                 }
4876             }
4877         }
4878
4879         if(cbp&0x30){
4880             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4881                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4882                     return -1;
4883                 }
4884         }
4885
4886         if(cbp&0x20){
4887             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4888                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4889                 for(i4x4=0; i4x4<4; i4x4++){
4890                     const int index= 16 + 4*chroma_idx + i4x4;
4891                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4892                         return -1;
4893                     }
4894                 }
4895             }
4896         }else{
4897             uint8_t * const nnz= &h->non_zero_count_cache[0];
4898             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4899             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4900         }
4901     }else{
4902         uint8_t * const nnz= &h->non_zero_count_cache[0];
4903         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4904         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4905         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4906     }
4907     s->current_picture.qscale_table[mb_xy]= s->qscale;
4908     write_back_non_zero_count(h);
4909
4910     if(MB_MBAFF){
4911         h->ref_count[0] >>= 1;
4912         h->ref_count[1] >>= 1;
4913     }
4914
4915     return 0;
4916 }
4917
4918 static int decode_cabac_field_decoding_flag(H264Context *h) {
4919     MpegEncContext * const s = &h->s;
4920     const int mb_x = s->mb_x;
4921     const int mb_y = s->mb_y & ~1;
4922     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4923     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4924
4925     unsigned int ctx = 0;
4926
4927     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4928         ctx += 1;
4929     }
4930     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4931         ctx += 1;
4932     }
4933
4934     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4935 }
4936
4937 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4938     uint8_t *state= &h->cabac_state[ctx_base];
4939     int mb_type;
4940
4941     if(intra_slice){
4942         MpegEncContext * const s = &h->s;
4943         const int mba_xy = h->left_mb_xy[0];
4944         const int mbb_xy = h->top_mb_xy;
4945         int ctx=0;
4946         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4947             ctx++;
4948         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4949             ctx++;
4950         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4951             return 0;   /* I4x4 */
4952         state += 2;
4953     }else{
4954         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4955             return 0;   /* I4x4 */
4956     }
4957
4958     if( get_cabac_terminate( &h->cabac ) )
4959         return 25;  /* PCM */
4960
4961     mb_type = 1; /* I16x16 */
4962     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4963     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4964         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4965     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4966     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4967     return mb_type;
4968 }
4969
4970 static int decode_cabac_mb_type_b( H264Context *h ) {
4971     MpegEncContext * const s = &h->s;
4972
4973         const int mba_xy = h->left_mb_xy[0];
4974         const int mbb_xy = h->top_mb_xy;
4975         int ctx = 0;
4976         int bits;
4977         assert(h->slice_type_nos == FF_B_TYPE);
4978
4979         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4980             ctx++;
4981         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4982             ctx++;
4983
4984         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4985             return 0; /* B_Direct_16x16 */
4986
4987         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4988             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4989         }
4990
4991         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4992         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4993         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4994         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4995         if( bits < 8 )
4996             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4997         else if( bits == 13 ) {
4998             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4999         } else if( bits == 14 )
5000             return 11; /* B_L1_L0_8x16 */
5001         else if( bits == 15 )
5002             return 22; /* B_8x8 */
5003
5004         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5005         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5006 }
5007
5008 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5009     MpegEncContext * const s = &h->s;
5010     int mba_xy, mbb_xy;
5011     int ctx = 0;
5012
5013     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5014         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5015         mba_xy = mb_xy - 1;
5016         if( (mb_y&1)
5017             && h->slice_table[mba_xy] == h->slice_num
5018             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5019             mba_xy += s->mb_stride;
5020         if( MB_FIELD ){
5021             mbb_xy = mb_xy - s->mb_stride;
5022             if( !(mb_y&1)
5023                 && h->slice_table[mbb_xy] == h->slice_num
5024                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5025                 mbb_xy -= s->mb_stride;
5026         }else
5027             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5028     }else{
5029         int mb_xy = h->mb_xy;
5030         mba_xy = mb_xy - 1;
5031         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5032     }
5033
5034     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5035         ctx++;
5036     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5037         ctx++;
5038
5039     if( h->slice_type_nos == FF_B_TYPE )
5040         ctx += 13;
5041     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5042 }
5043
5044 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5045     int mode = 0;
5046
5047     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5048         return pred_mode;
5049
5050     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5051     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5052     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5053
5054     if( mode >= pred_mode )
5055         return mode + 1;
5056     else
5057         return mode;
5058 }
5059
5060 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5061     const int mba_xy = h->left_mb_xy[0];
5062     const int mbb_xy = h->top_mb_xy;
5063
5064     int ctx = 0;
5065
5066     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5067     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5068         ctx++;
5069
5070     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5071         ctx++;
5072
5073     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5074         return 0;
5075
5076     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5077         return 1;
5078     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5079         return 2;
5080     else
5081         return 3;
5082 }
5083
5084 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5085     int cbp_b, cbp_a, ctx, cbp = 0;
5086
5087     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5088     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5089
5090     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5091     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5092     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5093     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5094     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5095     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5096     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5097     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5098     return cbp;
5099 }
5100 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5101     int ctx;
5102     int cbp_a, cbp_b;
5103
5104     cbp_a = (h->left_cbp>>4)&0x03;
5105     cbp_b = (h-> top_cbp>>4)&0x03;
5106
5107     ctx = 0;
5108     if( cbp_a > 0 ) ctx++;
5109     if( cbp_b > 0 ) ctx += 2;
5110     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5111         return 0;
5112
5113     ctx = 4;
5114     if( cbp_a == 2 ) ctx++;
5115     if( cbp_b == 2 ) ctx += 2;
5116     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5117 }
5118 static int decode_cabac_mb_dqp( H264Context *h) {
5119     int   ctx= h->last_qscale_diff != 0;
5120     int   val = 0;
5121
5122     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5123         ctx= 2+(ctx>>1);
5124         val++;
5125         if(val > 102) //prevent infinite loop
5126             return INT_MIN;
5127     }
5128
5129     if( val&0x01 )
5130         return   (val + 1)>>1 ;
5131     else
5132         return -((val + 1)>>1);
5133 }
5134 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5135     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5136         return 0;   /* 8x8 */
5137     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5138         return 1;   /* 8x4 */
5139     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5140         return 2;   /* 4x8 */
5141     return 3;       /* 4x4 */
5142 }
5143 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5144     int type;
5145     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5146         return 0;   /* B_Direct_8x8 */
5147     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5148         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5149     type = 3;
5150     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5151         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5152             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5153         type += 4;
5154     }
5155     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5156     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5157     return type;
5158 }
5159
5160 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5161     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5162 }
5163
5164 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5165     int refa = h->ref_cache[list][scan8[n] - 1];
5166     int refb = h->ref_cache[list][scan8[n] - 8];
5167     int ref  = 0;
5168     int ctx  = 0;
5169
5170     if( h->slice_type_nos == FF_B_TYPE) {
5171         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5172             ctx++;
5173         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5174             ctx += 2;
5175     } else {
5176         if( refa > 0 )
5177             ctx++;
5178         if( refb > 0 )
5179             ctx += 2;
5180     }
5181
5182     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5183         ref++;
5184         ctx = (ctx>>2)+4;
5185         if(ref >= 32 /*h->ref_list[list]*/){
5186             return -1;
5187         }
5188     }
5189     return ref;
5190 }
5191
5192 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5193     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5194                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5195     int ctxbase = (l == 0) ? 40 : 47;
5196     int mvd;
5197     int ctx = (amvd>2) + (amvd>32);
5198
5199     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5200         return 0;
5201
5202     mvd= 1;
5203     ctx= 3;
5204     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5205         mvd++;
5206         if( ctx < 6 )
5207             ctx++;
5208     }
5209
5210     if( mvd >= 9 ) {
5211         int k = 3;
5212         while( get_cabac_bypass( &h->cabac ) ) {
5213             mvd += 1 << k;
5214             k++;
5215             if(k>24){
5216                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5217                 return INT_MIN;
5218             }
5219         }
5220         while( k-- ) {
5221             if( get_cabac_bypass( &h->cabac ) )
5222                 mvd += 1 << k;
5223         }
5224     }
5225     return get_cabac_bypass_sign( &h->cabac, -mvd );
5226 }
5227
5228 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5229     int nza, nzb;
5230     int ctx = 0;
5231
5232     if( is_dc ) {
5233         if( cat == 0 ) {
5234             nza = h->left_cbp&0x100;
5235             nzb = h-> top_cbp&0x100;
5236         } else {
5237             nza = (h->left_cbp>>(6+idx))&0x01;
5238             nzb = (h-> top_cbp>>(6+idx))&0x01;
5239         }
5240     } else {
5241         assert(cat == 1 || cat == 2 || cat == 4);
5242         nza = h->non_zero_count_cache[scan8[idx] - 1];
5243         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5244     }
5245
5246     if( nza > 0 )
5247         ctx++;
5248
5249     if( nzb > 0 )
5250         ctx += 2;
5251
5252     return ctx + 4 * cat;
5253 }
5254
5255 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5256     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5257     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5258     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5259     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5260 };
5261
5262 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5263     static const int significant_coeff_flag_offset[2][6] = {
5264       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5265       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5266     };
5267     static const int last_coeff_flag_offset[2][6] = {
5268       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5269       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5270     };
5271     static const int coeff_abs_level_m1_offset[6] = {
5272         227+0, 227+10, 227+20, 227+30, 227+39, 426
5273     };
5274     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5275       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5276         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5277         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5278        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5279       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5280         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5281         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5282         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5283     };
5284     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5285      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5286      * map node ctx => cabac ctx for level=1 */
5287     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5288     /* map node ctx => cabac ctx for level>1 */
5289     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5290     static const uint8_t coeff_abs_level_transition[2][8] = {
5291     /* update node ctx after decoding a level=1 */
5292         { 1, 2, 3, 3, 4, 5, 6, 7 },
5293     /* update node ctx after decoding a level>1 */
5294         { 4, 4, 4, 4, 5, 6, 7, 7 }
5295     };
5296
5297     int index[64];
5298
5299     int av_unused last;
5300     int coeff_count = 0;
5301     int node_ctx = 0;
5302
5303     uint8_t *significant_coeff_ctx_base;
5304     uint8_t *last_coeff_ctx_base;
5305     uint8_t *abs_level_m1_ctx_base;
5306
5307 #if !ARCH_X86
5308 #define CABAC_ON_STACK
5309 #endif
5310 #ifdef CABAC_ON_STACK
5311 #define CC &cc
5312     CABACContext cc;
5313     cc.range     = h->cabac.range;
5314     cc.low       = h->cabac.low;
5315     cc.bytestream= h->cabac.bytestream;
5316 #else
5317 #define CC &h->cabac
5318 #endif
5319
5320
5321     /* cat: 0-> DC 16x16  n = 0
5322      *      1-> AC 16x16  n = luma4x4idx
5323      *      2-> Luma4x4   n = luma4x4idx
5324      *      3-> DC Chroma n = iCbCr
5325      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5326      *      5-> Luma8x8   n = 4 * luma8x8idx
5327      */
5328
5329     /* read coded block flag */
5330     if( is_dc || cat != 5 ) {
5331         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5332             if( !is_dc )
5333                 h->non_zero_count_cache[scan8[n]] = 0;
5334
5335 #ifdef CABAC_ON_STACK
5336             h->cabac.range     = cc.range     ;
5337             h->cabac.low       = cc.low       ;
5338             h->cabac.bytestream= cc.bytestream;
5339 #endif
5340             return;
5341         }
5342     }
5343
5344     significant_coeff_ctx_base = h->cabac_state
5345         + significant_coeff_flag_offset[MB_FIELD][cat];
5346     last_coeff_ctx_base = h->cabac_state
5347         + last_coeff_flag_offset[MB_FIELD][cat];
5348     abs_level_m1_ctx_base = h->cabac_state
5349         + coeff_abs_level_m1_offset[cat];
5350
5351     if( !is_dc && cat == 5 ) {
5352 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5353         for(last= 0; last < coefs; last++) { \
5354             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5355             if( get_cabac( CC, sig_ctx )) { \
5356                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5357                 index[coeff_count++] = last; \
5358                 if( get_cabac( CC, last_ctx ) ) { \
5359                     last= max_coeff; \
5360                     break; \
5361                 } \
5362             } \
5363         }\
5364         if( last == max_coeff -1 ) {\
5365             index[coeff_count++] = last;\
5366         }
5367         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5368 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5369         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5370     } else {
5371         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5372 #else
5373         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5374     } else {
5375         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5376 #endif
5377     }
5378     assert(coeff_count > 0);
5379
5380     if( is_dc ) {
5381         if( cat == 0 )
5382             h->cbp_table[h->mb_xy] |= 0x100;
5383         else
5384             h->cbp_table[h->mb_xy] |= 0x40 << n;
5385     } else {
5386         if( cat == 5 )
5387             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5388         else {
5389             assert( cat == 1 || cat == 2 || cat == 4 );
5390             h->non_zero_count_cache[scan8[n]] = coeff_count;
5391         }
5392     }
5393
5394     do {
5395         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5396
5397         int j= scantable[index[--coeff_count]];
5398
5399         if( get_cabac( CC, ctx ) == 0 ) {
5400             node_ctx = coeff_abs_level_transition[0][node_ctx];
5401             if( is_dc ) {
5402                 block[j] = get_cabac_bypass_sign( CC, -1);
5403             }else{
5404                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5405             }
5406         } else {
5407             int coeff_abs = 2;
5408             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5409             node_ctx = coeff_abs_level_transition[1][node_ctx];
5410
5411             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5412                 coeff_abs++;
5413             }
5414
5415             if( coeff_abs >= 15 ) {
5416                 int j = 0;
5417                 while( get_cabac_bypass( CC ) ) {
5418                     j++;
5419                 }
5420
5421                 coeff_abs=1;
5422                 while( j-- ) {
5423                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5424                 }
5425                 coeff_abs+= 14;
5426             }
5427
5428             if( is_dc ) {
5429                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5430             }else{
5431                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5432             }
5433         }
5434     } while( coeff_count );
5435 #ifdef CABAC_ON_STACK
5436             h->cabac.range     = cc.range     ;
5437             h->cabac.low       = cc.low       ;
5438             h->cabac.bytestream= cc.bytestream;
5439 #endif
5440
5441 }
5442
5443 #if !CONFIG_SMALL
5444 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5445     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5446 }
5447
5448 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5449     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5450 }
5451 #endif
5452
5453 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5454 #if CONFIG_SMALL
5455     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5456 #else
5457     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5458     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5459 #endif
5460 }
5461
5462 static inline void compute_mb_neighbors(H264Context *h)
5463 {
5464     MpegEncContext * const s = &h->s;
5465     const int mb_xy  = h->mb_xy;
5466     h->top_mb_xy     = mb_xy - s->mb_stride;
5467     h->left_mb_xy[0] = mb_xy - 1;
5468     if(FRAME_MBAFF){
5469         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5470         const int top_pair_xy      = pair_xy     - s->mb_stride;
5471         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5472         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5473         const int curr_mb_field_flag = MB_FIELD;
5474         const int bottom = (s->mb_y & 1);
5475
5476         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5477             h->top_mb_xy -= s->mb_stride;
5478         }
5479         if (!left_mb_field_flag == curr_mb_field_flag) {
5480             h->left_mb_xy[0] = pair_xy - 1;
5481         }
5482     } else if (FIELD_PICTURE) {
5483         h->top_mb_xy -= s->mb_stride;
5484     }
5485     return;
5486 }
5487
5488 /**
5489  * decodes a macroblock
5490  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5491  */
5492 static int decode_mb_cabac(H264Context *h) {
5493     MpegEncContext * const s = &h->s;
5494     int mb_xy;
5495     int mb_type, partition_count, cbp = 0;
5496     int dct8x8_allowed= h->pps.transform_8x8_mode;
5497
5498     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5499
5500     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5501     if( h->slice_type_nos != FF_I_TYPE ) {
5502         int skip;
5503         /* a skipped mb needs the aff flag from the following mb */
5504         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5505             predict_field_decoding_flag(h);
5506         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5507             skip = h->next_mb_skipped;
5508         else
5509             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5510         /* read skip flags */
5511         if( skip ) {
5512             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5513                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5514                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5515                 if(!h->next_mb_skipped)
5516                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5517             }
5518
5519             decode_mb_skip(h);
5520
5521             h->cbp_table[mb_xy] = 0;
5522             h->chroma_pred_mode_table[mb_xy] = 0;
5523             h->last_qscale_diff = 0;
5524
5525             return 0;
5526
5527         }
5528     }
5529     if(FRAME_MBAFF){
5530         if( (s->mb_y&1) == 0 )
5531             h->mb_mbaff =
5532             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5533     }
5534
5535     h->prev_mb_skipped = 0;
5536
5537     compute_mb_neighbors(h);
5538
5539     if( h->slice_type_nos == FF_B_TYPE ) {
5540         mb_type = decode_cabac_mb_type_b( h );
5541         if( mb_type < 23 ){
5542             partition_count= b_mb_type_info[mb_type].partition_count;
5543             mb_type=         b_mb_type_info[mb_type].type;
5544         }else{
5545             mb_type -= 23;
5546             goto decode_intra_mb;
5547         }
5548     } else if( h->slice_type_nos == FF_P_TYPE ) {
5549         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5550             /* P-type */
5551             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5552                 /* P_L0_D16x16, P_8x8 */
5553                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5554             } else {
5555                 /* P_L0_D8x16, P_L0_D16x8 */
5556                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5557             }
5558             partition_count= p_mb_type_info[mb_type].partition_count;
5559             mb_type=         p_mb_type_info[mb_type].type;
5560         } else {
5561             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5562             goto decode_intra_mb;
5563         }
5564     } else {
5565         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5566         if(h->slice_type == FF_SI_TYPE && mb_type)
5567             mb_type--;
5568         assert(h->slice_type_nos == FF_I_TYPE);
5569 decode_intra_mb:
5570         partition_count = 0;
5571         cbp= i_mb_type_info[mb_type].cbp;
5572         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5573         mb_type= i_mb_type_info[mb_type].type;
5574     }
5575     if(MB_FIELD)
5576         mb_type |= MB_TYPE_INTERLACED;
5577
5578     h->slice_table[ mb_xy ]= h->slice_num;
5579
5580     if(IS_INTRA_PCM(mb_type)) {
5581         const uint8_t *ptr;
5582
5583         // We assume these blocks are very rare so we do not optimize it.
5584         // FIXME The two following lines get the bitstream position in the cabac
5585         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5586         ptr= h->cabac.bytestream;
5587         if(h->cabac.low&0x1) ptr--;
5588         if(CABAC_BITS==16){
5589             if(h->cabac.low&0x1FF) ptr--;
5590         }
5591
5592         // The pixels are stored in the same order as levels in h->mb array.
5593         memcpy(h->mb, ptr, 256); ptr+=256;
5594         if(CHROMA){
5595             memcpy(h->mb+128, ptr, 128); ptr+=128;
5596         }
5597
5598         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5599
5600         // All blocks are present
5601         h->cbp_table[mb_xy] = 0x1ef;
5602         h->chroma_pred_mode_table[mb_xy] = 0;
5603         // In deblocking, the quantizer is 0
5604         s->current_picture.qscale_table[mb_xy]= 0;
5605         // All coeffs are present
5606         memset(h->non_zero_count[mb_xy], 16, 16);
5607         s->current_picture.mb_type[mb_xy]= mb_type;
5608         h->last_qscale_diff = 0;
5609         return 0;
5610     }
5611
5612     if(MB_MBAFF){
5613         h->ref_count[0] <<= 1;
5614         h->ref_count[1] <<= 1;
5615     }
5616
5617     fill_caches(h, mb_type, 0);
5618
5619     if( IS_INTRA( mb_type ) ) {
5620         int i, pred_mode;
5621         if( IS_INTRA4x4( mb_type ) ) {
5622             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5623                 mb_type |= MB_TYPE_8x8DCT;
5624                 for( i = 0; i < 16; i+=4 ) {
5625                     int pred = pred_intra_mode( h, i );
5626                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5627                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5628                 }
5629             } else {
5630                 for( i = 0; i < 16; i++ ) {
5631                     int pred = pred_intra_mode( h, i );
5632                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5633
5634                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5635                 }
5636             }
5637             write_back_intra_pred_mode(h);
5638             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5639         } else {
5640             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5641             if( h->intra16x16_pred_mode < 0 ) return -1;
5642         }
5643         if(CHROMA){
5644             h->chroma_pred_mode_table[mb_xy] =
5645             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5646
5647             pred_mode= check_intra_pred_mode( h, pred_mode );
5648             if( pred_mode < 0 ) return -1;
5649             h->chroma_pred_mode= pred_mode;
5650         }
5651     } else if( partition_count == 4 ) {
5652         int i, j, sub_partition_count[4], list, ref[2][4];
5653
5654         if( h->slice_type_nos == FF_B_TYPE ) {
5655             for( i = 0; i < 4; i++ ) {
5656                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5657                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5658                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5659             }
5660             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5661                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5662                 pred_direct_motion(h, &mb_type);
5663                 h->ref_cache[0][scan8[4]] =
5664                 h->ref_cache[1][scan8[4]] =
5665                 h->ref_cache[0][scan8[12]] =
5666                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5667                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5668                     for( i = 0; i < 4; i++ )
5669                         if( IS_DIRECT(h->sub_mb_type[i]) )
5670                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5671                 }
5672             }
5673         } else {
5674             for( i = 0; i < 4; i++ ) {
5675                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5676                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5677                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5678             }
5679         }
5680
5681         for( list = 0; list < h->list_count; list++ ) {
5682                 for( i = 0; i < 4; i++ ) {
5683                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5684                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5685                         if( h->ref_count[list] > 1 ){
5686                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5687                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5688                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5689                                 return -1;
5690                             }
5691                         }else
5692                             ref[list][i] = 0;
5693                     } else {
5694                         ref[list][i] = -1;
5695                     }
5696                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5697                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5698                 }
5699         }
5700
5701         if(dct8x8_allowed)
5702             dct8x8_allowed = get_dct8x8_allowed(h);
5703
5704         for(list=0; list<h->list_count; list++){
5705             for(i=0; i<4; i++){
5706                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5707                 if(IS_DIRECT(h->sub_mb_type[i])){
5708                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5709                     continue;
5710                 }
5711
5712                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5713                     const int sub_mb_type= h->sub_mb_type[i];
5714                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5715                     for(j=0; j<sub_partition_count[i]; j++){
5716                         int mpx, mpy;
5717                         int mx, my;
5718                         const int index= 4*i + block_width*j;
5719                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5720                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5721                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5722
5723                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5724                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5725                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5726
5727                         if(IS_SUB_8X8(sub_mb_type)){
5728                             mv_cache[ 1 ][0]=
5729                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5730                             mv_cache[ 1 ][1]=
5731                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5732
5733                             mvd_cache[ 1 ][0]=
5734                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5735                             mvd_cache[ 1 ][1]=
5736                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5737                         }else if(IS_SUB_8X4(sub_mb_type)){
5738                             mv_cache[ 1 ][0]= mx;
5739                             mv_cache[ 1 ][1]= my;
5740
5741                             mvd_cache[ 1 ][0]= mx - mpx;
5742                             mvd_cache[ 1 ][1]= my - mpy;
5743                         }else if(IS_SUB_4X8(sub_mb_type)){
5744                             mv_cache[ 8 ][0]= mx;
5745                             mv_cache[ 8 ][1]= my;
5746
5747                             mvd_cache[ 8 ][0]= mx - mpx;
5748                             mvd_cache[ 8 ][1]= my - mpy;
5749                         }
5750                         mv_cache[ 0 ][0]= mx;
5751                         mv_cache[ 0 ][1]= my;
5752
5753                         mvd_cache[ 0 ][0]= mx - mpx;
5754                         mvd_cache[ 0 ][1]= my - mpy;
5755                     }
5756                 }else{
5757                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5758                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5759                     p[0] = p[1] = p[8] = p[9] = 0;
5760                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5761                 }
5762             }
5763         }
5764     } else if( IS_DIRECT(mb_type) ) {
5765         pred_direct_motion(h, &mb_type);
5766         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5767         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5768         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5769     } else {
5770         int list, mx, my, i, mpx, mpy;
5771         if(IS_16X16(mb_type)){
5772             for(list=0; list<h->list_count; list++){
5773                 if(IS_DIR(mb_type, 0, list)){
5774                     int ref;
5775                     if(h->ref_count[list] > 1){
5776                         ref= decode_cabac_mb_ref(h, list, 0);
5777                         if(ref >= (unsigned)h->ref_count[list]){
5778                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5779                             return -1;
5780                         }
5781                     }else
5782                         ref=0;
5783                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5784                 }else
5785                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5786             }
5787             for(list=0; list<h->list_count; list++){
5788                 if(IS_DIR(mb_type, 0, list)){
5789                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5790
5791                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5792                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5793                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5794
5795                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5796                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5797                 }else
5798                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5799             }
5800         }
5801         else if(IS_16X8(mb_type)){
5802             for(list=0; list<h->list_count; list++){
5803                     for(i=0; i<2; i++){
5804                         if(IS_DIR(mb_type, i, list)){
5805                             int ref;
5806                             if(h->ref_count[list] > 1){
5807                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5808                                 if(ref >= (unsigned)h->ref_count[list]){
5809                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5810                                     return -1;
5811                                 }
5812                             }else
5813                                 ref=0;
5814                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5815                         }else
5816                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5817                     }
5818             }
5819             for(list=0; list<h->list_count; list++){
5820                 for(i=0; i<2; i++){
5821                     if(IS_DIR(mb_type, i, list)){
5822                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5823                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5824                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5825                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5826
5827                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5828                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5829                     }else{
5830                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5831                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5832                     }
5833                 }
5834             }
5835         }else{
5836             assert(IS_8X16(mb_type));
5837             for(list=0; list<h->list_count; list++){
5838                     for(i=0; i<2; i++){
5839                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5840                             int ref;
5841                             if(h->ref_count[list] > 1){
5842                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5843                                 if(ref >= (unsigned)h->ref_count[list]){
5844                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5845                                     return -1;
5846                                 }
5847                             }else
5848                                 ref=0;
5849                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5850                         }else
5851                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5852                     }
5853             }
5854             for(list=0; list<h->list_count; list++){
5855                 for(i=0; i<2; i++){
5856                     if(IS_DIR(mb_type, i, list)){
5857                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5858                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5859                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5860
5861                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5862                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5863                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5864                     }else{
5865                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5866                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5867                     }
5868                 }
5869             }
5870         }
5871     }
5872
5873    if( IS_INTER( mb_type ) ) {
5874         h->chroma_pred_mode_table[mb_xy] = 0;
5875         write_back_motion( h, mb_type );
5876    }
5877
5878     if( !IS_INTRA16x16( mb_type ) ) {
5879         cbp  = decode_cabac_mb_cbp_luma( h );
5880         if(CHROMA)
5881             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5882     }
5883
5884     h->cbp_table[mb_xy] = h->cbp = cbp;
5885
5886     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5887         if( decode_cabac_mb_transform_size( h ) )
5888             mb_type |= MB_TYPE_8x8DCT;
5889     }
5890     s->current_picture.mb_type[mb_xy]= mb_type;
5891
5892     if( cbp || IS_INTRA16x16( mb_type ) ) {
5893         const uint8_t *scan, *scan8x8, *dc_scan;
5894         const uint32_t *qmul;
5895         int dqp;
5896
5897         if(IS_INTERLACED(mb_type)){
5898             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5899             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5900             dc_scan= luma_dc_field_scan;
5901         }else{
5902             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5903             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5904             dc_scan= luma_dc_zigzag_scan;
5905         }
5906
5907         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5908         if( dqp == INT_MIN ){
5909             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5910             return -1;
5911         }
5912         s->qscale += dqp;
5913         if(((unsigned)s->qscale) > 51){
5914             if(s->qscale<0) s->qscale+= 52;
5915             else            s->qscale-= 52;
5916         }
5917         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5918         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5919
5920         if( IS_INTRA16x16( mb_type ) ) {
5921             int i;
5922             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5923             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5924
5925             if( cbp&15 ) {
5926                 qmul = h->dequant4_coeff[0][s->qscale];
5927                 for( i = 0; i < 16; i++ ) {
5928                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5929                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5930                 }
5931             } else {
5932                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5933             }
5934         } else {
5935             int i8x8, i4x4;
5936             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5937                 if( cbp & (1<<i8x8) ) {
5938                     if( IS_8x8DCT(mb_type) ) {
5939                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5940                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5941                     } else {
5942                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5943                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5944                             const int index = 4*i8x8 + i4x4;
5945                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5946 //START_TIMER
5947                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5948 //STOP_TIMER("decode_residual")
5949                         }
5950                     }
5951                 } else {
5952                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5953                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5954                 }
5955             }
5956         }
5957
5958         if( cbp&0x30 ){
5959             int c;
5960             for( c = 0; c < 2; c++ ) {
5961                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5962                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5963             }
5964         }
5965
5966         if( cbp&0x20 ) {
5967             int c, i;
5968             for( c = 0; c < 2; c++ ) {
5969                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5970                 for( i = 0; i < 4; i++ ) {
5971                     const int index = 16 + 4 * c + i;
5972                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5973                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5974                 }
5975             }
5976         } else {
5977             uint8_t * const nnz= &h->non_zero_count_cache[0];
5978             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5979             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5980         }
5981     } else {
5982         uint8_t * const nnz= &h->non_zero_count_cache[0];
5983         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5984         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5985         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5986         h->last_qscale_diff = 0;
5987     }
5988
5989     s->current_picture.qscale_table[mb_xy]= s->qscale;
5990     write_back_non_zero_count(h);
5991
5992     if(MB_MBAFF){
5993         h->ref_count[0] >>= 1;
5994         h->ref_count[1] >>= 1;
5995     }
5996
5997     return 0;
5998 }
5999
6000
6001 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6002     const int index_a = qp + h->slice_alpha_c0_offset;
6003     const int alpha = (alpha_table+52)[index_a];
6004     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6005
6006     if( bS[0] < 4 ) {
6007         int8_t tc[4];
6008         tc[0] = (tc0_table+52)[index_a][bS[0]];
6009         tc[1] = (tc0_table+52)[index_a][bS[1]];
6010         tc[2] = (tc0_table+52)[index_a][bS[2]];
6011         tc[3] = (tc0_table+52)[index_a][bS[3]];
6012         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6013     } else {
6014         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6015     }
6016 }
6017 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6018     const int index_a = qp + h->slice_alpha_c0_offset;
6019     const int alpha = (alpha_table+52)[index_a];
6020     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6021
6022     if( bS[0] < 4 ) {
6023         int8_t tc[4];
6024         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6025         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6026         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6027         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6028         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6029     } else {
6030         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6031     }
6032 }
6033
6034 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6035     int i;
6036     for( i = 0; i < 16; i++, pix += stride) {
6037         int index_a;
6038         int alpha;
6039         int beta;
6040
6041         int qp_index;
6042         int bS_index = (i >> 1);
6043         if (!MB_FIELD) {
6044             bS_index &= ~1;
6045             bS_index |= (i & 1);
6046         }
6047
6048         if( bS[bS_index] == 0 ) {
6049             continue;
6050         }
6051
6052         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6053         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6054         alpha = (alpha_table+52)[index_a];
6055         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6056
6057         if( bS[bS_index] < 4 ) {
6058             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6059             const int p0 = pix[-1];
6060             const int p1 = pix[-2];
6061             const int p2 = pix[-3];
6062             const int q0 = pix[0];
6063             const int q1 = pix[1];
6064             const int q2 = pix[2];
6065
6066             if( FFABS( p0 - q0 ) < alpha &&
6067                 FFABS( p1 - p0 ) < beta &&
6068                 FFABS( q1 - q0 ) < beta ) {
6069                 int tc = tc0;
6070                 int i_delta;
6071
6072                 if( FFABS( p2 - p0 ) < beta ) {
6073                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6074                     tc++;
6075                 }
6076                 if( FFABS( q2 - q0 ) < beta ) {
6077                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6078                     tc++;
6079                 }
6080
6081                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6082                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6083                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6084                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6085             }
6086         }else{
6087             const int p0 = pix[-1];
6088             const int p1 = pix[-2];
6089             const int p2 = pix[-3];
6090
6091             const int q0 = pix[0];
6092             const int q1 = pix[1];
6093             const int q2 = pix[2];
6094
6095             if( FFABS( p0 - q0 ) < alpha &&
6096                 FFABS( p1 - p0 ) < beta &&
6097                 FFABS( q1 - q0 ) < beta ) {
6098
6099                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6100                     if( FFABS( p2 - p0 ) < beta)
6101                     {
6102                         const int p3 = pix[-4];
6103                         /* p0', p1', p2' */
6104                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6105                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6106                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6107                     } else {
6108                         /* p0' */
6109                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6110                     }
6111                     if( FFABS( q2 - q0 ) < beta)
6112                     {
6113                         const int q3 = pix[3];
6114                         /* q0', q1', q2' */
6115                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6116                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6117                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6118                     } else {
6119                         /* q0' */
6120                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6121                     }
6122                 }else{
6123                     /* p0', q0' */
6124                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6125                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6126                 }
6127                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6128             }
6129         }
6130     }
6131 }
6132 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6133     int i;
6134     for( i = 0; i < 8; i++, pix += stride) {
6135         int index_a;
6136         int alpha;
6137         int beta;
6138
6139         int qp_index;
6140         int bS_index = i;
6141
6142         if( bS[bS_index] == 0 ) {
6143             continue;
6144         }
6145
6146         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6147         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6148         alpha = (alpha_table+52)[index_a];
6149         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6150
6151         if( bS[bS_index] < 4 ) {
6152             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6153             const int p0 = pix[-1];
6154             const int p1 = pix[-2];
6155             const int q0 = pix[0];
6156             const int q1 = pix[1];
6157
6158             if( FFABS( p0 - q0 ) < alpha &&
6159                 FFABS( p1 - p0 ) < beta &&
6160                 FFABS( q1 - q0 ) < beta ) {
6161                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6162
6163                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6164                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6165                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6166             }
6167         }else{
6168             const int p0 = pix[-1];
6169             const int p1 = pix[-2];
6170             const int q0 = pix[0];
6171             const int q1 = pix[1];
6172
6173             if( FFABS( p0 - q0 ) < alpha &&
6174                 FFABS( p1 - p0 ) < beta &&
6175                 FFABS( q1 - q0 ) < beta ) {
6176
6177                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6178                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6179                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6180             }
6181         }
6182     }
6183 }
6184
6185 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6186     const int index_a = qp + h->slice_alpha_c0_offset;
6187     const int alpha = (alpha_table+52)[index_a];
6188     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6189
6190     if( bS[0] < 4 ) {
6191         int8_t tc[4];
6192         tc[0] = (tc0_table+52)[index_a][bS[0]];
6193         tc[1] = (tc0_table+52)[index_a][bS[1]];
6194         tc[2] = (tc0_table+52)[index_a][bS[2]];
6195         tc[3] = (tc0_table+52)[index_a][bS[3]];
6196         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6197     } else {
6198         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6199     }
6200 }
6201
6202 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6203     const int index_a = qp + h->slice_alpha_c0_offset;
6204     const int alpha = (alpha_table+52)[index_a];
6205     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6206
6207     if( bS[0] < 4 ) {
6208         int8_t tc[4];
6209         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6210         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6211         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6212         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6213         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6214     } else {
6215         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6216     }
6217 }
6218
6219 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6220     MpegEncContext * const s = &h->s;
6221     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6222     int mb_xy, mb_type;
6223     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6224
6225     mb_xy = h->mb_xy;
6226
6227     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6228         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6229        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6230                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6231         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6232         return;
6233     }
6234     assert(!FRAME_MBAFF);
6235
6236     mb_type = s->current_picture.mb_type[mb_xy];
6237     qp = s->current_picture.qscale_table[mb_xy];
6238     qp0 = s->current_picture.qscale_table[mb_xy-1];
6239     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6240     qpc = get_chroma_qp( h, 0, qp );
6241     qpc0 = get_chroma_qp( h, 0, qp0 );
6242     qpc1 = get_chroma_qp( h, 0, qp1 );
6243     qp0 = (qp + qp0 + 1) >> 1;
6244     qp1 = (qp + qp1 + 1) >> 1;
6245     qpc0 = (qpc + qpc0 + 1) >> 1;
6246     qpc1 = (qpc + qpc1 + 1) >> 1;
6247     qp_thresh = 15 - h->slice_alpha_c0_offset;
6248     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6249        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6250         return;
6251
6252     if( IS_INTRA(mb_type) ) {
6253         int16_t bS4[4] = {4,4,4,4};
6254         int16_t bS3[4] = {3,3,3,3};
6255         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6256         if( IS_8x8DCT(mb_type) ) {
6257             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6258             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6259             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6260             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6261         } else {
6262             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6263             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6264             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6265             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6266             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6267             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6268             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6269             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6270         }
6271         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6272         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6273         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6274         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6275         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6276         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6277         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6278         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6279         return;
6280     } else {
6281         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6282         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6283         int edges;
6284         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6285             edges = 4;
6286             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6287         } else {
6288             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6289                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6290             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6291                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6292                              ? 3 : 0;
6293             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6294             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6295             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6296                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6297         }
6298         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6299             bSv[0][0] = 0x0004000400040004ULL;
6300         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6301             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6302
6303 #define FILTER(hv,dir,edge)\
6304         if(bSv[dir][edge]) {\
6305             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6306             if(!(edge&1)) {\
6307                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6308                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6309             }\
6310         }
6311         if( edges == 1 ) {
6312             FILTER(v,0,0);
6313             FILTER(h,1,0);
6314         } else if( IS_8x8DCT(mb_type) ) {
6315             FILTER(v,0,0);
6316             FILTER(v,0,2);
6317             FILTER(h,1,0);
6318             FILTER(h,1,2);
6319         } else {
6320             FILTER(v,0,0);
6321             FILTER(v,0,1);
6322             FILTER(v,0,2);
6323             FILTER(v,0,3);
6324             FILTER(h,1,0);
6325             FILTER(h,1,1);
6326             FILTER(h,1,2);
6327             FILTER(h,1,3);
6328         }
6329 #undef FILTER
6330     }
6331 }
6332
6333
6334 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6335     MpegEncContext * const s = &h->s;
6336     int edge;
6337     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6338     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6339     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6340     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6341     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6342
6343     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6344                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6345     // how often to recheck mv-based bS when iterating between edges
6346     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6347                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6348     // how often to recheck mv-based bS when iterating along each edge
6349     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6350
6351     if (first_vertical_edge_done) {
6352         start = 1;
6353     }
6354
6355     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6356         start = 1;
6357
6358     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6359         && !IS_INTERLACED(mb_type)
6360         && IS_INTERLACED(mbm_type)
6361         ) {
6362         // This is a special case in the norm where the filtering must
6363         // be done twice (one each of the field) even if we are in a
6364         // frame macroblock.
6365         //
6366         static const int nnz_idx[4] = {4,5,6,3};
6367         unsigned int tmp_linesize   = 2 *   linesize;
6368         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6369         int mbn_xy = mb_xy - 2 * s->mb_stride;
6370         int qp;
6371         int i, j;
6372         int16_t bS[4];
6373
6374         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6375             if( IS_INTRA(mb_type) ||
6376                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6377                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6378             } else {
6379                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6380                 for( i = 0; i < 4; i++ ) {
6381                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6382                         mbn_nnz[nnz_idx[i]] != 0 )
6383                         bS[i] = 2;
6384                     else
6385                         bS[i] = 1;
6386                 }
6387             }
6388             // Do not use s->qscale as luma quantizer because it has not the same
6389             // value in IPCM macroblocks.
6390             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6391             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6392             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6393             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6394             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6395                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6396             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6397                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6398         }
6399
6400         start = 1;
6401     }
6402
6403     /* Calculate bS */
6404     for( edge = start; edge < edges; edge++ ) {
6405         /* mbn_xy: neighbor macroblock */
6406         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6407         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6408         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6409         int16_t bS[4];
6410         int qp;
6411
6412         if( (edge&1) && IS_8x8DCT(mb_type) )
6413             continue;
6414
6415         if( IS_INTRA(mb_type) ||
6416             IS_INTRA(mbn_type) ) {
6417             int value;
6418             if (edge == 0) {
6419                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6420                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6421                 ) {
6422                     value = 4;
6423                 } else {
6424                     value = 3;
6425                 }
6426             } else {
6427                 value = 3;
6428             }
6429             bS[0] = bS[1] = bS[2] = bS[3] = value;
6430         } else {
6431             int i, l;
6432             int mv_done;
6433
6434             if( edge & mask_edge ) {
6435                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6436                 mv_done = 1;
6437             }
6438             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6439                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6440                 mv_done = 1;
6441             }
6442             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6443                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6444                 int bn_idx= b_idx - (dir ? 8:1);
6445                 int v = 0;
6446
6447                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6448                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6449                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6450                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6451                 }
6452
6453                 if(h->slice_type_nos == FF_B_TYPE && v){
6454                     v=0;
6455                     for( l = 0; !v && l < 2; l++ ) {
6456                         int ln= 1-l;
6457                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6458                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6459                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6460                     }
6461                 }
6462
6463                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6464                 mv_done = 1;
6465             }
6466             else
6467                 mv_done = 0;
6468
6469             for( i = 0; i < 4; i++ ) {
6470                 int x = dir == 0 ? edge : i;
6471                 int y = dir == 0 ? i    : edge;
6472                 int b_idx= 8 + 4 + x + 8*y;
6473                 int bn_idx= b_idx - (dir ? 8:1);
6474
6475                 if( h->non_zero_count_cache[b_idx] |
6476                     h->non_zero_count_cache[bn_idx] ) {
6477                     bS[i] = 2;
6478                 }
6479                 else if(!mv_done)
6480                 {
6481                     bS[i] = 0;
6482                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6483                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6484                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6485                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6486                             bS[i] = 1;
6487                             break;
6488                         }
6489                     }
6490
6491                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6492                         bS[i] = 0;
6493                         for( l = 0; l < 2; l++ ) {
6494                             int ln= 1-l;
6495                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6496                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6497                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6498                                 bS[i] = 1;
6499                                 break;
6500                             }
6501                         }
6502                     }
6503                 }
6504             }
6505
6506             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6507                 continue;
6508         }
6509
6510         /* Filter edge */
6511         // Do not use s->qscale as luma quantizer because it has not the same
6512         // value in IPCM macroblocks.
6513         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6514         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6515         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6516         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6517         if( dir == 0 ) {
6518             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6519             if( (edge&1) == 0 ) {
6520                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6521                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6522                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6523                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6524             }
6525         } else {
6526             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6527             if( (edge&1) == 0 ) {
6528                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6529                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6530                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6531                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6532             }
6533         }
6534     }
6535 }
6536
6537 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6538     MpegEncContext * const s = &h->s;
6539     const int mb_xy= mb_x + mb_y*s->mb_stride;
6540     const int mb_type = s->current_picture.mb_type[mb_xy];
6541     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6542     int first_vertical_edge_done = 0;
6543     av_unused int dir;
6544
6545     //for sufficiently low qp, filtering wouldn't do anything
6546     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6547     if(!FRAME_MBAFF){
6548         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6549         int qp = s->current_picture.qscale_table[mb_xy];
6550         if(qp <= qp_thresh
6551            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6552            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6553             return;
6554         }
6555     }
6556
6557     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6558     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6559         int top_type, left_type[2];
6560         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6561         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6562         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6563
6564         if(IS_8x8DCT(top_type)){
6565             h->non_zero_count_cache[4+8*0]=
6566             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6567             h->non_zero_count_cache[6+8*0]=
6568             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6569         }
6570         if(IS_8x8DCT(left_type[0])){
6571             h->non_zero_count_cache[3+8*1]=
6572             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6573         }
6574         if(IS_8x8DCT(left_type[1])){
6575             h->non_zero_count_cache[3+8*3]=
6576             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6577         }
6578
6579         if(IS_8x8DCT(mb_type)){
6580             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6581             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6582
6583             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6584             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6585
6586             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6587             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6588
6589             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6590             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6591         }
6592     }
6593
6594     if (FRAME_MBAFF
6595             // left mb is in picture
6596             && h->slice_table[mb_xy-1] != 0xFFFF
6597             // and current and left pair do not have the same interlaced type
6598             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6599             // and left mb is in the same slice if deblocking_filter == 2
6600             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6601         /* First vertical edge is different in MBAFF frames
6602          * There are 8 different bS to compute and 2 different Qp
6603          */
6604         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6605         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6606         int16_t bS[8];
6607         int qp[2];
6608         int bqp[2];
6609         int rqp[2];
6610         int mb_qp, mbn0_qp, mbn1_qp;
6611         int i;
6612         first_vertical_edge_done = 1;
6613
6614         if( IS_INTRA(mb_type) )
6615             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6616         else {
6617             for( i = 0; i < 8; i++ ) {
6618                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6619
6620                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6621                     bS[i] = 4;
6622                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6623                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6624                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6625                                                                        :
6626                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6627                     bS[i] = 2;
6628                 else
6629                     bS[i] = 1;
6630             }
6631         }
6632
6633         mb_qp = s->current_picture.qscale_table[mb_xy];
6634         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6635         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6636         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6637         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6638                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6639         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6640                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6641         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6642         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6643                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6644         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6645                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6646
6647         /* Filter edge */
6648         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6649         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6650         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6651         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6652         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6653     }
6654
6655 #if CONFIG_SMALL
6656     for( dir = 0; dir < 2; dir++ )
6657         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6658 #else
6659     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6660     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6661 #endif
6662 }
6663
6664 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6665     H264Context *h = *(void**)arg;
6666     MpegEncContext * const s = &h->s;
6667     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6668
6669     s->mb_skip_run= -1;
6670
6671     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6672                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6673
6674     if( h->pps.cabac ) {
6675         int i;
6676
6677         /* realign */
6678         align_get_bits( &s->gb );
6679
6680         /* init cabac */
6681         ff_init_cabac_states( &h->cabac);
6682         ff_init_cabac_decoder( &h->cabac,
6683                                s->gb.buffer + get_bits_count(&s->gb)/8,
6684                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6685         /* calculate pre-state */
6686         for( i= 0; i < 460; i++ ) {
6687             int pre;
6688             if( h->slice_type_nos == FF_I_TYPE )
6689                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6690             else
6691                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6692
6693             if( pre <= 63 )
6694                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6695             else
6696                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6697         }
6698
6699         for(;;){
6700 //START_TIMER
6701             int ret = decode_mb_cabac(h);
6702             int eos;
6703 //STOP_TIMER("decode_mb_cabac")
6704
6705             if(ret>=0) hl_decode_mb(h);
6706
6707             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6708                 s->mb_y++;
6709
6710                 ret = decode_mb_cabac(h);
6711
6712                 if(ret>=0) hl_decode_mb(h);
6713                 s->mb_y--;
6714             }
6715             eos = get_cabac_terminate( &h->cabac );
6716
6717             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6718                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6719                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6720                 return -1;
6721             }
6722
6723             if( ++s->mb_x >= s->mb_width ) {
6724                 s->mb_x = 0;
6725                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6726                 ++s->mb_y;
6727                 if(FIELD_OR_MBAFF_PICTURE) {
6728                     ++s->mb_y;
6729                 }
6730             }
6731
6732             if( eos || s->mb_y >= s->mb_height ) {
6733                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6734                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6735                 return 0;
6736             }
6737         }
6738
6739     } else {
6740         for(;;){
6741             int ret = decode_mb_cavlc(h);
6742
6743             if(ret>=0) hl_decode_mb(h);
6744
6745             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6746                 s->mb_y++;
6747                 ret = decode_mb_cavlc(h);
6748
6749                 if(ret>=0) hl_decode_mb(h);
6750                 s->mb_y--;
6751             }
6752
6753             if(ret<0){
6754                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6755                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6756
6757                 return -1;
6758             }
6759
6760             if(++s->mb_x >= s->mb_width){
6761                 s->mb_x=0;
6762                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6763                 ++s->mb_y;
6764                 if(FIELD_OR_MBAFF_PICTURE) {
6765                     ++s->mb_y;
6766                 }
6767                 if(s->mb_y >= s->mb_height){
6768                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6769
6770                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6771                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6772
6773                         return 0;
6774                     }else{
6775                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6776
6777                         return -1;
6778                     }
6779                 }
6780             }
6781
6782             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6783                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6784                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6785                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6786
6787                     return 0;
6788                 }else{
6789                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6790
6791                     return -1;
6792                 }
6793             }
6794         }
6795     }
6796
6797 #if 0
6798     for(;s->mb_y < s->mb_height; s->mb_y++){
6799         for(;s->mb_x < s->mb_width; s->mb_x++){
6800             int ret= decode_mb(h);
6801
6802             hl_decode_mb(h);
6803
6804             if(ret<0){
6805                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6806                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6807
6808                 return -1;
6809             }
6810
6811             if(++s->mb_x >= s->mb_width){
6812                 s->mb_x=0;
6813                 if(++s->mb_y >= s->mb_height){
6814                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6815                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6816
6817                         return 0;
6818                     }else{
6819                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6820
6821                         return -1;
6822                     }
6823                 }
6824             }
6825
6826             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6827                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6828                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6829
6830                     return 0;
6831                 }else{
6832                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6833
6834                     return -1;
6835                 }
6836             }
6837         }
6838         s->mb_x=0;
6839         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6840     }
6841 #endif
6842     return -1; //not reached
6843 }
6844
6845 static int decode_picture_timing(H264Context *h){
6846     MpegEncContext * const s = &h->s;
6847     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6848         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6849         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6850     }
6851     if(h->sps.pic_struct_present_flag){
6852         unsigned int i, num_clock_ts;
6853         h->sei_pic_struct = get_bits(&s->gb, 4);
6854         h->sei_ct_type    = 0;
6855
6856         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6857             return -1;
6858
6859         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6860
6861         for (i = 0 ; i < num_clock_ts ; i++){
6862             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6863                 unsigned int full_timestamp_flag;
6864                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6865                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6866                 skip_bits(&s->gb, 5);                 /* counting_type */
6867                 full_timestamp_flag = get_bits(&s->gb, 1);
6868                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6869                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6870                 skip_bits(&s->gb, 8);                 /* n_frames */
6871                 if(full_timestamp_flag){
6872                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6873                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6874                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6875                 }else{
6876                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6877                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6878                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6879                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6880                             if(get_bits(&s->gb, 1))   /* hours_flag */
6881                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6882                         }
6883                     }
6884                 }
6885                 if(h->sps.time_offset_length > 0)
6886                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6887             }
6888         }
6889     }
6890     return 0;
6891 }
6892
6893 static int decode_unregistered_user_data(H264Context *h, int size){
6894     MpegEncContext * const s = &h->s;
6895     uint8_t user_data[16+256];
6896     int e, build, i;
6897
6898     if(size<16)
6899         return -1;
6900
6901     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6902         user_data[i]= get_bits(&s->gb, 8);
6903     }
6904
6905     user_data[i]= 0;
6906     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6907     if(e==1 && build>=0)
6908         h->x264_build= build;
6909
6910     if(s->avctx->debug & FF_DEBUG_BUGS)
6911         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6912
6913     for(; i<size; i++)
6914         skip_bits(&s->gb, 8);
6915
6916     return 0;
6917 }
6918
6919 static int decode_recovery_point(H264Context *h){
6920     MpegEncContext * const s = &h->s;
6921
6922     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6923     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6924
6925     return 0;
6926 }
6927
6928 static int decode_buffering_period(H264Context *h){
6929     MpegEncContext * const s = &h->s;
6930     unsigned int sps_id;
6931     int sched_sel_idx;
6932     SPS *sps;
6933
6934     sps_id = get_ue_golomb_31(&s->gb);
6935     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6936         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6937         return -1;
6938     }
6939     sps = h->sps_buffers[sps_id];
6940
6941     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6942     if (sps->nal_hrd_parameters_present_flag) {
6943         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6944             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6945             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6946         }
6947     }
6948     if (sps->vcl_hrd_parameters_present_flag) {
6949         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6950             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6951             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6952         }
6953     }
6954
6955     h->sei_buffering_period_present = 1;
6956     return 0;
6957 }
6958
6959 int ff_h264_decode_sei(H264Context *h){
6960     MpegEncContext * const s = &h->s;
6961
6962     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6963         int size, type;
6964
6965         type=0;
6966         do{
6967             type+= show_bits(&s->gb, 8);
6968         }while(get_bits(&s->gb, 8) == 255);
6969
6970         size=0;
6971         do{
6972             size+= show_bits(&s->gb, 8);
6973         }while(get_bits(&s->gb, 8) == 255);
6974
6975         switch(type){
6976         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6977             if(decode_picture_timing(h) < 0)
6978                 return -1;
6979             break;
6980         case SEI_TYPE_USER_DATA_UNREGISTERED:
6981             if(decode_unregistered_user_data(h, size) < 0)
6982                 return -1;
6983             break;
6984         case SEI_TYPE_RECOVERY_POINT:
6985             if(decode_recovery_point(h) < 0)
6986                 return -1;
6987             break;
6988         case SEI_BUFFERING_PERIOD:
6989             if(decode_buffering_period(h) < 0)
6990                 return -1;
6991             break;
6992         default:
6993             skip_bits(&s->gb, 8*size);
6994         }
6995
6996         //FIXME check bits here
6997         align_get_bits(&s->gb);
6998     }
6999
7000     return 0;
7001 }
7002
7003 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7004     MpegEncContext * const s = &h->s;
7005     int cpb_count, i;
7006     cpb_count = get_ue_golomb_31(&s->gb) + 1;
7007
7008     if(cpb_count > 32U){
7009         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7010         return -1;
7011     }
7012
7013     get_bits(&s->gb, 4); /* bit_rate_scale */
7014     get_bits(&s->gb, 4); /* cpb_size_scale */
7015     for(i=0; i<cpb_count; i++){
7016         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7017         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7018         get_bits1(&s->gb);     /* cbr_flag */
7019     }
7020     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7021     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7022     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7023     sps->time_offset_length = get_bits(&s->gb, 5);
7024     sps->cpb_cnt = cpb_count;
7025     return 0;
7026 }
7027
7028 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7029     MpegEncContext * const s = &h->s;
7030     int aspect_ratio_info_present_flag;
7031     unsigned int aspect_ratio_idc;
7032
7033     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7034
7035     if( aspect_ratio_info_present_flag ) {
7036         aspect_ratio_idc= get_bits(&s->gb, 8);
7037         if( aspect_ratio_idc == EXTENDED_SAR ) {
7038             sps->sar.num= get_bits(&s->gb, 16);
7039             sps->sar.den= get_bits(&s->gb, 16);
7040         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7041             sps->sar=  pixel_aspect[aspect_ratio_idc];
7042         }else{
7043             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7044             return -1;
7045         }
7046     }else{
7047         sps->sar.num=
7048         sps->sar.den= 0;
7049     }
7050 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7051
7052     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7053         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7054     }
7055
7056     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7057         get_bits(&s->gb, 3);    /* video_format */
7058         get_bits1(&s->gb);      /* video_full_range_flag */
7059         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7060             get_bits(&s->gb, 8); /* colour_primaries */
7061             get_bits(&s->gb, 8); /* transfer_characteristics */
7062             get_bits(&s->gb, 8); /* matrix_coefficients */
7063         }
7064     }
7065
7066     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7067         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7068         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7069     }
7070
7071     sps->timing_info_present_flag = get_bits1(&s->gb);
7072     if(sps->timing_info_present_flag){
7073         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7074         sps->time_scale = get_bits_long(&s->gb, 32);
7075         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7076     }
7077
7078     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7079     if(sps->nal_hrd_parameters_present_flag)
7080         if(decode_hrd_parameters(h, sps) < 0)
7081             return -1;
7082     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7083     if(sps->vcl_hrd_parameters_present_flag)
7084         if(decode_hrd_parameters(h, sps) < 0)
7085             return -1;
7086     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7087         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7088     sps->pic_struct_present_flag = get_bits1(&s->gb);
7089
7090     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7091     if(sps->bitstream_restriction_flag){
7092         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7093         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7094         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7095         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7096         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7097         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7098         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7099
7100         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7101             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7102             return -1;
7103         }
7104     }
7105
7106     return 0;
7107 }
7108
7109 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7110                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7111     MpegEncContext * const s = &h->s;
7112     int i, last = 8, next = 8;
7113     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7114     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7115         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7116     else
7117     for(i=0;i<size;i++){
7118         if(next)
7119             next = (last + get_se_golomb(&s->gb)) & 0xff;
7120         if(!i && !next){ /* matrix not written, we use the preset one */
7121             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7122             break;
7123         }
7124         last = factors[scan[i]] = next ? next : last;
7125     }
7126 }
7127
7128 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7129                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7130     MpegEncContext * const s = &h->s;
7131     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7132     const uint8_t *fallback[4] = {
7133         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7134         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7135         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7136         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7137     };
7138     if(get_bits1(&s->gb)){
7139         sps->scaling_matrix_present |= is_sps;
7140         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7141         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7142         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7143         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7144         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7145         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7146         if(is_sps || pps->transform_8x8_mode){
7147             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7148             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7149         }
7150     }
7151 }
7152
7153 int ff_h264_decode_seq_parameter_set(H264Context *h){
7154     MpegEncContext * const s = &h->s;
7155     int profile_idc, level_idc;
7156     unsigned int sps_id;
7157     int i;
7158     SPS *sps;
7159
7160     profile_idc= get_bits(&s->gb, 8);
7161     get_bits1(&s->gb);   //constraint_set0_flag
7162     get_bits1(&s->gb);   //constraint_set1_flag
7163     get_bits1(&s->gb);   //constraint_set2_flag
7164     get_bits1(&s->gb);   //constraint_set3_flag
7165     get_bits(&s->gb, 4); // reserved
7166     level_idc= get_bits(&s->gb, 8);
7167     sps_id= get_ue_golomb_31(&s->gb);
7168
7169     if(sps_id >= MAX_SPS_COUNT) {
7170         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7171         return -1;
7172     }
7173     sps= av_mallocz(sizeof(SPS));
7174     if(sps == NULL)
7175         return -1;
7176
7177     sps->profile_idc= profile_idc;
7178     sps->level_idc= level_idc;
7179
7180     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7181     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7182     sps->scaling_matrix_present = 0;
7183
7184     if(sps->profile_idc >= 100){ //high profile
7185         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7186         if(sps->chroma_format_idc == 3)
7187             sps->residual_color_transform_flag = get_bits1(&s->gb);
7188         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7189         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7190         sps->transform_bypass = get_bits1(&s->gb);
7191         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7192     }else{
7193         sps->chroma_format_idc= 1;
7194     }
7195
7196     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7197     sps->poc_type= get_ue_golomb_31(&s->gb);
7198
7199     if(sps->poc_type == 0){ //FIXME #define
7200         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7201     } else if(sps->poc_type == 1){//FIXME #define
7202         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7203         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7204         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7205         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7206
7207         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7208             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7209             goto fail;
7210         }
7211
7212         for(i=0; i<sps->poc_cycle_length; i++)
7213             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7214     }else if(sps->poc_type != 2){
7215         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7216         goto fail;
7217     }
7218
7219     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7220     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7221         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7222         goto fail;
7223     }
7224     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7225     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7226     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7227     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7228        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7229         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7230         goto fail;
7231     }
7232
7233     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7234     if(!sps->frame_mbs_only_flag)
7235         sps->mb_aff= get_bits1(&s->gb);
7236     else
7237         sps->mb_aff= 0;
7238
7239     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7240
7241 #ifndef ALLOW_INTERLACE
7242     if(sps->mb_aff)
7243         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7244 #endif
7245     sps->crop= get_bits1(&s->gb);
7246     if(sps->crop){
7247         sps->crop_left  = get_ue_golomb(&s->gb);
7248         sps->crop_right = get_ue_golomb(&s->gb);
7249         sps->crop_top   = get_ue_golomb(&s->gb);
7250         sps->crop_bottom= get_ue_golomb(&s->gb);
7251         if(sps->crop_left || sps->crop_top){
7252             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7253         }
7254         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7255             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7256         }
7257     }else{
7258         sps->crop_left  =
7259         sps->crop_right =
7260         sps->crop_top   =
7261         sps->crop_bottom= 0;
7262     }
7263
7264     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7265     if( sps->vui_parameters_present_flag )
7266         decode_vui_parameters(h, sps);
7267
7268     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7269         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7270                sps_id, sps->profile_idc, sps->level_idc,
7271                sps->poc_type,
7272                sps->ref_frame_count,
7273                sps->mb_width, sps->mb_height,
7274                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7275                sps->direct_8x8_inference_flag ? "8B8" : "",
7276                sps->crop_left, sps->crop_right,
7277                sps->crop_top, sps->crop_bottom,
7278                sps->vui_parameters_present_flag ? "VUI" : "",
7279                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7280                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7281                sps->timing_info_present_flag ? sps->time_scale : 0
7282                );
7283     }
7284
7285     av_free(h->sps_buffers[sps_id]);
7286     h->sps_buffers[sps_id]= sps;
7287     h->sps = *sps;
7288     return 0;
7289 fail:
7290     av_free(sps);
7291     return -1;
7292 }
7293
7294 static void
7295 build_qp_table(PPS *pps, int t, int index)
7296 {
7297     int i;
7298     for(i = 0; i < 52; i++)
7299         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7300 }
7301
7302 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7303     MpegEncContext * const s = &h->s;
7304     unsigned int pps_id= get_ue_golomb(&s->gb);
7305     PPS *pps;
7306
7307     if(pps_id >= MAX_PPS_COUNT) {
7308         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7309         return -1;
7310     }
7311
7312     pps= av_mallocz(sizeof(PPS));
7313     if(pps == NULL)
7314         return -1;
7315     pps->sps_id= get_ue_golomb_31(&s->gb);
7316     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7317         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7318         goto fail;
7319     }
7320
7321     pps->cabac= get_bits1(&s->gb);
7322     pps->pic_order_present= get_bits1(&s->gb);
7323     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7324     if(pps->slice_group_count > 1 ){
7325         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7326         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7327         switch(pps->mb_slice_group_map_type){
7328         case 0:
7329 #if 0
7330 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7331 |    run_length[ i ]                                |1  |ue(v)   |
7332 #endif
7333             break;
7334         case 2:
7335 #if 0
7336 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7337 |{                                                  |   |        |
7338 |    top_left_mb[ i ]                               |1  |ue(v)   |
7339 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7340 |   }                                               |   |        |
7341 #endif
7342             break;
7343         case 3:
7344         case 4:
7345         case 5:
7346 #if 0
7347 |   slice_group_change_direction_flag               |1  |u(1)    |
7348 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7349 #endif
7350             break;
7351         case 6:
7352 #if 0
7353 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7354 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7355 |)                                                  |   |        |
7356 |    slice_group_id[ i ]                            |1  |u(v)    |
7357 #endif
7358             break;
7359         }
7360     }
7361     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7362     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7363     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7364         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7365         goto fail;
7366     }
7367
7368     pps->weighted_pred= get_bits1(&s->gb);
7369     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7370     pps->init_qp= get_se_golomb(&s->gb) + 26;
7371     pps->init_qs= get_se_golomb(&s->gb) + 26;
7372     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7373     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7374     pps->constrained_intra_pred= get_bits1(&s->gb);
7375     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7376
7377     pps->transform_8x8_mode= 0;
7378     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7379     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7380     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7381
7382     if(get_bits_count(&s->gb) < bit_length){
7383         pps->transform_8x8_mode= get_bits1(&s->gb);
7384         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7385         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7386     } else {
7387         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7388     }
7389
7390     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7391     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7392     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7393         h->pps.chroma_qp_diff= 1;
7394
7395     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7396         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7397                pps_id, pps->sps_id,
7398                pps->cabac ? "CABAC" : "CAVLC",
7399                pps->slice_group_count,
7400                pps->ref_count[0], pps->ref_count[1],
7401                pps->weighted_pred ? "weighted" : "",
7402                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7403                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7404                pps->constrained_intra_pred ? "CONSTR" : "",
7405                pps->redundant_pic_cnt_present ? "REDU" : "",
7406                pps->transform_8x8_mode ? "8x8DCT" : ""
7407                );
7408     }
7409
7410     av_free(h->pps_buffers[pps_id]);
7411     h->pps_buffers[pps_id]= pps;
7412     return 0;
7413 fail:
7414     av_free(pps);
7415     return -1;
7416 }
7417
7418 /**
7419  * Call decode_slice() for each context.
7420  *
7421  * @param h h264 master context
7422  * @param context_count number of contexts to execute
7423  */
7424 static void execute_decode_slices(H264Context *h, int context_count){
7425     MpegEncContext * const s = &h->s;
7426     AVCodecContext * const avctx= s->avctx;
7427     H264Context *hx;
7428     int i;
7429
7430     if (s->avctx->hwaccel)
7431         return;
7432     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7433         return;
7434     if(context_count == 1) {
7435         decode_slice(avctx, &h);
7436     } else {
7437         for(i = 1; i < context_count; i++) {
7438             hx = h->thread_context[i];
7439             hx->s.error_recognition = avctx->error_recognition;
7440             hx->s.error_count = 0;
7441         }
7442
7443         avctx->execute(avctx, (void *)decode_slice,
7444                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7445
7446         /* pull back stuff from slices to master context */
7447         hx = h->thread_context[context_count - 1];
7448         s->mb_x = hx->s.mb_x;
7449         s->mb_y = hx->s.mb_y;
7450         s->dropable = hx->s.dropable;
7451         s->picture_structure = hx->s.picture_structure;
7452         for(i = 1; i < context_count; i++)
7453             h->s.error_count += h->thread_context[i]->s.error_count;
7454     }
7455 }
7456
7457
7458 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7459     MpegEncContext * const s = &h->s;
7460     AVCodecContext * const avctx= s->avctx;
7461     int buf_index=0;
7462     H264Context *hx; ///< thread context
7463     int context_count = 0;
7464     int next_avc= h->is_avc ? 0 : buf_size;
7465
7466     h->max_contexts = avctx->thread_count;
7467 #if 0
7468     int i;
7469     for(i=0; i<50; i++){
7470         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7471     }
7472 #endif
7473     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7474         h->current_slice = 0;
7475         if (!s->first_field)
7476             s->current_picture_ptr= NULL;
7477         reset_sei(h);
7478     }
7479
7480     for(;;){
7481         int consumed;
7482         int dst_length;
7483         int bit_length;
7484         const uint8_t *ptr;
7485         int i, nalsize = 0;
7486         int err;
7487
7488         if(buf_index >= next_avc) {
7489             if(buf_index >= buf_size) break;
7490             nalsize = 0;
7491             for(i = 0; i < h->nal_length_size; i++)
7492                 nalsize = (nalsize << 8) | buf[buf_index++];
7493             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7494                 if(nalsize == 1){
7495                     buf_index++;
7496                     continue;
7497                 }else{
7498                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7499                     break;
7500                 }
7501             }
7502             next_avc= buf_index + nalsize;
7503         } else {
7504             // start code prefix search
7505             for(; buf_index + 3 < buf_size; buf_index++){
7506                 // This should always succeed in the first iteration.
7507                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7508                     break;
7509             }
7510
7511             if(buf_index+3 >= buf_size) break;
7512
7513             buf_index+=3;
7514         }
7515
7516         hx = h->thread_context[context_count];
7517
7518         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7519         if (ptr==NULL || dst_length < 0){
7520             return -1;
7521         }
7522         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7523             dst_length--;
7524         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7525
7526         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7527             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7528         }
7529
7530         if (h->is_avc && (nalsize != consumed) && nalsize){
7531             int i, debug_level = AV_LOG_DEBUG;
7532             for (i = consumed; i < nalsize; i++)
7533                 if (buf[buf_index+i])
7534                     debug_level = AV_LOG_ERROR;
7535             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7536         }
7537
7538         buf_index += consumed;
7539
7540         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7541            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7542             continue;
7543
7544       again:
7545         err = 0;
7546         switch(hx->nal_unit_type){
7547         case NAL_IDR_SLICE:
7548             if (h->nal_unit_type != NAL_IDR_SLICE) {
7549                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7550                 return -1;
7551             }
7552             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7553         case NAL_SLICE:
7554             init_get_bits(&hx->s.gb, ptr, bit_length);
7555             hx->intra_gb_ptr=
7556             hx->inter_gb_ptr= &hx->s.gb;
7557             hx->s.data_partitioning = 0;
7558
7559             if((err = decode_slice_header(hx, h)))
7560                break;
7561
7562             if (s->avctx->hwaccel && h->current_slice == 1) {
7563                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7564                     return -1;
7565             }
7566
7567             s->current_picture_ptr->key_frame |=
7568                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7569                     (h->sei_recovery_frame_cnt >= 0);
7570             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7571                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7572                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7573                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7574                && avctx->skip_frame < AVDISCARD_ALL){
7575                 if(avctx->hwaccel) {
7576                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7577                         return -1;
7578                 }else
7579                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7580                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7581                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7582                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7583                 }else
7584                     context_count++;
7585             }
7586             break;
7587         case NAL_DPA:
7588             init_get_bits(&hx->s.gb, ptr, bit_length);
7589             hx->intra_gb_ptr=
7590             hx->inter_gb_ptr= NULL;
7591             hx->s.data_partitioning = 1;
7592
7593             err = decode_slice_header(hx, h);
7594             break;
7595         case NAL_DPB:
7596             init_get_bits(&hx->intra_gb, ptr, bit_length);
7597             hx->intra_gb_ptr= &hx->intra_gb;
7598             break;
7599         case NAL_DPC:
7600             init_get_bits(&hx->inter_gb, ptr, bit_length);
7601             hx->inter_gb_ptr= &hx->inter_gb;
7602
7603             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7604                && s->context_initialized
7605                && s->hurry_up < 5
7606                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7607                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7608                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7609                && avctx->skip_frame < AVDISCARD_ALL)
7610                 context_count++;
7611             break;
7612         case NAL_SEI:
7613             init_get_bits(&s->gb, ptr, bit_length);
7614             ff_h264_decode_sei(h);
7615             break;
7616         case NAL_SPS:
7617             init_get_bits(&s->gb, ptr, bit_length);
7618             ff_h264_decode_seq_parameter_set(h);
7619
7620             if(s->flags& CODEC_FLAG_LOW_DELAY)
7621                 s->low_delay=1;
7622
7623             if(avctx->has_b_frames < 2)
7624                 avctx->has_b_frames= !s->low_delay;
7625             break;
7626         case NAL_PPS:
7627             init_get_bits(&s->gb, ptr, bit_length);
7628
7629             ff_h264_decode_picture_parameter_set(h, bit_length);
7630
7631             break;
7632         case NAL_AUD:
7633         case NAL_END_SEQUENCE:
7634         case NAL_END_STREAM:
7635         case NAL_FILLER_DATA:
7636         case NAL_SPS_EXT:
7637         case NAL_AUXILIARY_SLICE:
7638             break;
7639         default:
7640             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7641         }
7642
7643         if(context_count == h->max_contexts) {
7644             execute_decode_slices(h, context_count);
7645             context_count = 0;
7646         }
7647
7648         if (err < 0)
7649             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7650         else if(err == 1) {
7651             /* Slice could not be decoded in parallel mode, copy down
7652              * NAL unit stuff to context 0 and restart. Note that
7653              * rbsp_buffer is not transferred, but since we no longer
7654              * run in parallel mode this should not be an issue. */
7655             h->nal_unit_type = hx->nal_unit_type;
7656             h->nal_ref_idc   = hx->nal_ref_idc;
7657             hx = h;
7658             goto again;
7659         }
7660     }
7661     if(context_count)
7662         execute_decode_slices(h, context_count);
7663     return buf_index;
7664 }
7665
7666 /**
7667  * returns the number of bytes consumed for building the current frame
7668  */
7669 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7670         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7671         if(pos+10>buf_size) pos=buf_size; // oops ;)
7672
7673         return pos;
7674 }
7675
7676 static int decode_frame(AVCodecContext *avctx,
7677                              void *data, int *data_size,
7678                              AVPacket *avpkt)
7679 {
7680     const uint8_t *buf = avpkt->data;
7681     int buf_size = avpkt->size;
7682     H264Context *h = avctx->priv_data;
7683     MpegEncContext *s = &h->s;
7684     AVFrame *pict = data;
7685     int buf_index;
7686
7687     s->flags= avctx->flags;
7688     s->flags2= avctx->flags2;
7689
7690    /* end of stream, output what is still in the buffers */
7691     if (buf_size == 0) {
7692         Picture *out;
7693         int i, out_idx;
7694
7695 //FIXME factorize this with the output code below
7696         out = h->delayed_pic[0];
7697         out_idx = 0;
7698         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7699             if(h->delayed_pic[i]->poc < out->poc){
7700                 out = h->delayed_pic[i];
7701                 out_idx = i;
7702             }
7703
7704         for(i=out_idx; h->delayed_pic[i]; i++)
7705             h->delayed_pic[i] = h->delayed_pic[i+1];
7706
7707         if(out){
7708             *data_size = sizeof(AVFrame);
7709             *pict= *(AVFrame*)out;
7710         }
7711
7712         return 0;
7713     }
7714
7715     if(h->is_avc && !h->got_avcC) {
7716         int i, cnt, nalsize;
7717         unsigned char *p = avctx->extradata;
7718         if(avctx->extradata_size < 7) {
7719             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7720             return -1;
7721         }
7722         if(*p != 1) {
7723             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7724             return -1;
7725         }
7726         /* sps and pps in the avcC always have length coded with 2 bytes,
7727            so put a fake nal_length_size = 2 while parsing them */
7728         h->nal_length_size = 2;
7729         // Decode sps from avcC
7730         cnt = *(p+5) & 0x1f; // Number of sps
7731         p += 6;
7732         for (i = 0; i < cnt; i++) {
7733             nalsize = AV_RB16(p) + 2;
7734             if(decode_nal_units(h, p, nalsize) < 0) {
7735                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7736                 return -1;
7737             }
7738             p += nalsize;
7739         }
7740         // Decode pps from avcC
7741         cnt = *(p++); // Number of pps
7742         for (i = 0; i < cnt; i++) {
7743             nalsize = AV_RB16(p) + 2;
7744             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7745                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7746                 return -1;
7747             }
7748             p += nalsize;
7749         }
7750         // Now store right nal length size, that will be use to parse all other nals
7751         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7752         // Do not reparse avcC
7753         h->got_avcC = 1;
7754     }
7755
7756     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7757         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7758             return -1;
7759         h->got_avcC = 1;
7760     }
7761
7762     buf_index=decode_nal_units(h, buf, buf_size);
7763     if(buf_index < 0)
7764         return -1;
7765
7766     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7767         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7768         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7769         return -1;
7770     }
7771
7772     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7773         Picture *out = s->current_picture_ptr;
7774         Picture *cur = s->current_picture_ptr;
7775         int i, pics, cross_idr, out_of_order, out_idx;
7776
7777         field_end(h);
7778
7779         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7780             /* Wait for second field. */
7781             *data_size = 0;
7782
7783         } else {
7784             cur->repeat_pict = 0;
7785
7786             /* Signal interlacing information externally. */
7787             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7788             if (h->sei_ct_type)
7789                 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7790             else
7791                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7792
7793             if(h->sps.pic_struct_present_flag){
7794                 switch (h->sei_pic_struct)
7795                 {
7796                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7797                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7798                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7799                     // From these hints, let the applications decide if they apply deinterlacing.
7800                     cur->repeat_pict = 1;
7801                     break;
7802                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7803                     // Force progressive here, as doubling interlaced frame is a bad idea.
7804                     cur->interlaced_frame = 0;
7805                     cur->repeat_pict = 2;
7806                     break;
7807                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7808                     cur->interlaced_frame = 0;
7809                     cur->repeat_pict = 4;
7810                     break;
7811                 }
7812             }else{
7813                 /* Derive interlacing flag from used decoding process. */
7814                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7815             }
7816
7817             if (cur->field_poc[0] != cur->field_poc[1]){
7818                 /* Derive top_field_first from field pocs. */
7819                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7820             }else{
7821                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7822                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7823                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7824                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7825                         cur->top_field_first = 1;
7826                     else
7827                         cur->top_field_first = 0;
7828                 }else{
7829                     /* Most likely progressive */
7830                     cur->top_field_first = 0;
7831                 }
7832             }
7833
7834         //FIXME do something with unavailable reference frames
7835
7836             /* Sort B-frames into display order */
7837
7838             if(h->sps.bitstream_restriction_flag
7839                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7840                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7841                 s->low_delay = 0;
7842             }
7843
7844             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7845                && !h->sps.bitstream_restriction_flag){
7846                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7847                 s->low_delay= 0;
7848             }
7849
7850             pics = 0;
7851             while(h->delayed_pic[pics]) pics++;
7852
7853             assert(pics <= MAX_DELAYED_PIC_COUNT);
7854
7855             h->delayed_pic[pics++] = cur;
7856             if(cur->reference == 0)
7857                 cur->reference = DELAYED_PIC_REF;
7858
7859             out = h->delayed_pic[0];
7860             out_idx = 0;
7861             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7862                 if(h->delayed_pic[i]->poc < out->poc){
7863                     out = h->delayed_pic[i];
7864                     out_idx = i;
7865                 }
7866             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7867
7868             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7869
7870             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7871                 { }
7872             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7873                || (s->low_delay &&
7874                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7875                  || cur->pict_type == FF_B_TYPE)))
7876             {
7877                 s->low_delay = 0;
7878                 s->avctx->has_b_frames++;
7879             }
7880
7881             if(out_of_order || pics > s->avctx->has_b_frames){
7882                 out->reference &= ~DELAYED_PIC_REF;
7883                 for(i=out_idx; h->delayed_pic[i]; i++)
7884                     h->delayed_pic[i] = h->delayed_pic[i+1];
7885             }
7886             if(!out_of_order && pics > s->avctx->has_b_frames){
7887                 *data_size = sizeof(AVFrame);
7888
7889                 h->outputed_poc = out->poc;
7890                 *pict= *(AVFrame*)out;
7891             }else{
7892                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7893             }
7894         }
7895     }
7896
7897     assert(pict->data[0] || !*data_size);
7898     ff_print_debug_info(s, pict);
7899 //printf("out %d\n", (int)pict->data[0]);
7900 #if 0 //?
7901
7902     /* Return the Picture timestamp as the frame number */
7903     /* we subtract 1 because it is added on utils.c     */
7904     avctx->frame_number = s->picture_number - 1;
7905 #endif
7906     return get_consumed_bytes(s, buf_index, buf_size);
7907 }
7908 #if 0
7909 static inline void fill_mb_avail(H264Context *h){
7910     MpegEncContext * const s = &h->s;
7911     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7912
7913     if(s->mb_y){
7914         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7915         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7916         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7917     }else{
7918         h->mb_avail[0]=
7919         h->mb_avail[1]=
7920         h->mb_avail[2]= 0;
7921     }
7922     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7923     h->mb_avail[4]= 1; //FIXME move out
7924     h->mb_avail[5]= 0; //FIXME move out
7925 }
7926 #endif
7927
7928 #ifdef TEST
7929 #undef printf
7930 #undef random
7931 #define COUNT 8000
7932 #define SIZE (COUNT*40)
7933 int main(void){
7934     int i;
7935     uint8_t temp[SIZE];
7936     PutBitContext pb;
7937     GetBitContext gb;
7938 //    int int_temp[10000];
7939     DSPContext dsp;
7940     AVCodecContext avctx;
7941
7942     dsputil_init(&dsp, &avctx);
7943
7944     init_put_bits(&pb, temp, SIZE);
7945     printf("testing unsigned exp golomb\n");
7946     for(i=0; i<COUNT; i++){
7947         START_TIMER
7948         set_ue_golomb(&pb, i);
7949         STOP_TIMER("set_ue_golomb");
7950     }
7951     flush_put_bits(&pb);
7952
7953     init_get_bits(&gb, temp, 8*SIZE);
7954     for(i=0; i<COUNT; i++){
7955         int j, s;
7956
7957         s= show_bits(&gb, 24);
7958
7959         START_TIMER
7960         j= get_ue_golomb(&gb);
7961         if(j != i){
7962             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7963 //            return -1;
7964         }
7965         STOP_TIMER("get_ue_golomb");
7966     }
7967
7968
7969     init_put_bits(&pb, temp, SIZE);
7970     printf("testing signed exp golomb\n");
7971     for(i=0; i<COUNT; i++){
7972         START_TIMER
7973         set_se_golomb(&pb, i - COUNT/2);
7974         STOP_TIMER("set_se_golomb");
7975     }
7976     flush_put_bits(&pb);
7977
7978     init_get_bits(&gb, temp, 8*SIZE);
7979     for(i=0; i<COUNT; i++){
7980         int j, s;
7981
7982         s= show_bits(&gb, 24);
7983
7984         START_TIMER
7985         j= get_se_golomb(&gb);
7986         if(j != i - COUNT/2){
7987             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7988 //            return -1;
7989         }
7990         STOP_TIMER("get_se_golomb");
7991     }
7992
7993 #if 0
7994     printf("testing 4x4 (I)DCT\n");
7995
7996     DCTELEM block[16];
7997     uint8_t src[16], ref[16];
7998     uint64_t error= 0, max_error=0;
7999
8000     for(i=0; i<COUNT; i++){
8001         int j;
8002 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8003         for(j=0; j<16; j++){
8004             ref[j]= random()%255;
8005             src[j]= random()%255;
8006         }
8007
8008         h264_diff_dct_c(block, src, ref, 4);
8009
8010         //normalize
8011         for(j=0; j<16; j++){
8012 //            printf("%d ", block[j]);
8013             block[j]= block[j]*4;
8014             if(j&1) block[j]= (block[j]*4 + 2)/5;
8015             if(j&4) block[j]= (block[j]*4 + 2)/5;
8016         }
8017 //        printf("\n");
8018
8019         s->dsp.h264_idct_add(ref, block, 4);
8020 /*        for(j=0; j<16; j++){
8021             printf("%d ", ref[j]);
8022         }
8023         printf("\n");*/
8024
8025         for(j=0; j<16; j++){
8026             int diff= FFABS(src[j] - ref[j]);
8027
8028             error+= diff*diff;
8029             max_error= FFMAX(max_error, diff);
8030         }
8031     }
8032     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8033     printf("testing quantizer\n");
8034     for(qp=0; qp<52; qp++){
8035         for(i=0; i<16; i++)
8036             src1_block[i]= src2_block[i]= random()%255;
8037
8038     }
8039     printf("Testing NAL layer\n");
8040
8041     uint8_t bitstream[COUNT];
8042     uint8_t nal[COUNT*2];
8043     H264Context h;
8044     memset(&h, 0, sizeof(H264Context));
8045
8046     for(i=0; i<COUNT; i++){
8047         int zeros= i;
8048         int nal_length;
8049         int consumed;
8050         int out_length;
8051         uint8_t *out;
8052         int j;
8053
8054         for(j=0; j<COUNT; j++){
8055             bitstream[j]= (random() % 255) + 1;
8056         }
8057
8058         for(j=0; j<zeros; j++){
8059             int pos= random() % COUNT;
8060             while(bitstream[pos] == 0){
8061                 pos++;
8062                 pos %= COUNT;
8063             }
8064             bitstream[pos]=0;
8065         }
8066
8067         START_TIMER
8068
8069         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8070         if(nal_length<0){
8071             printf("encoding failed\n");
8072             return -1;
8073         }
8074
8075         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8076
8077         STOP_TIMER("NAL")
8078
8079         if(out_length != COUNT){
8080             printf("incorrect length %d %d\n", out_length, COUNT);
8081             return -1;
8082         }
8083
8084         if(consumed != nal_length){
8085             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8086             return -1;
8087         }
8088
8089         if(memcmp(bitstream, out, COUNT)){
8090             printf("mismatch\n");
8091             return -1;
8092         }
8093     }
8094 #endif
8095
8096     printf("Testing RBSP\n");
8097
8098
8099     return 0;
8100 }
8101 #endif /* TEST */
8102
8103
8104 av_cold void ff_h264_free_context(H264Context *h)
8105 {
8106     int i;
8107
8108     av_freep(&h->rbsp_buffer[0]);
8109     av_freep(&h->rbsp_buffer[1]);
8110     free_tables(h); //FIXME cleanup init stuff perhaps
8111
8112     for(i = 0; i < MAX_SPS_COUNT; i++)
8113         av_freep(h->sps_buffers + i);
8114
8115     for(i = 0; i < MAX_PPS_COUNT; i++)
8116         av_freep(h->pps_buffers + i);
8117 }
8118
8119 static av_cold int decode_end(AVCodecContext *avctx)
8120 {
8121     H264Context *h = avctx->priv_data;
8122     MpegEncContext *s = &h->s;
8123
8124     ff_h264_free_context(h);
8125
8126     MPV_common_end(s);
8127
8128 //    memset(h, 0, sizeof(H264Context));
8129
8130     return 0;
8131 }
8132
8133
8134 AVCodec h264_decoder = {
8135     "h264",
8136     CODEC_TYPE_VIDEO,
8137     CODEC_ID_H264,
8138     sizeof(H264Context),
8139     decode_init,
8140     NULL,
8141     decode_end,
8142     decode_frame,
8143     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8144     .flush= flush_dpb,
8145     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8146     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8147 };
8148
8149 #if CONFIG_H264_VDPAU_DECODER
8150 AVCodec h264_vdpau_decoder = {
8151     "h264_vdpau",
8152     CODEC_TYPE_VIDEO,
8153     CODEC_ID_H264,
8154     sizeof(H264Context),
8155     decode_init,
8156     NULL,
8157     decode_end,
8158     decode_frame,
8159     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8160     .flush= flush_dpb,
8161     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8162 };
8163 #endif
8164
8165 #if CONFIG_SVQ3_DECODER
8166 #include "svq3.c"
8167 #endif