eac3dec: get rid of unnecessary left shifts in 16-bit * 24-bit
[FFMpeg-mirror/lagarith.git] / libavcodec / h264.c
blobf4c172b6062a91c249bb2d3a0f5a2110df8dd0bb
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 /**
23 * @file libavcodec/h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
28 #include "internal.h"
29 #include "dsputil.h"
30 #include "avcodec.h"
31 #include "mpegvideo.h"
32 #include "h264.h"
33 #include "h264data.h"
34 #include "h264_parser.h"
35 #include "golomb.h"
36 #include "mathops.h"
37 #include "rectangle.h"
38 #include "vdpau_internal.h"
40 #include "cabac.h"
41 #if ARCH_X86
42 #include "x86/h264_i386.h"
43 #endif
45 //#undef NDEBUG
46 #include <assert.h>
48 /**
49 * Value of Picture.reference when Picture is not a reference picture, but
50 * is held for delayed output.
52 #define DELAYED_PIC_REF 4
54 static VLC coeff_token_vlc[4];
55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
58 static VLC chroma_dc_coeff_token_vlc;
59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
62 static VLC total_zeros_vlc[15];
63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
64 static const int total_zeros_vlc_tables_size = 512;
66 static VLC chroma_dc_total_zeros_vlc[3];
67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
70 static VLC run_vlc[6];
71 static VLC_TYPE run_vlc_tables[6][8][2];
72 static const int run_vlc_tables_size = 8;
74 static VLC run7_vlc;
75 static VLC_TYPE run7_vlc_table[96][2];
76 static const int run7_vlc_table_size = 96;
78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
/* Fallback so the helper also compiles standalone; inert in the real build
 * where attributes.h defines av_always_inline first. */
#ifndef av_always_inline
#define av_always_inline inline
#endif
/**
 * Packs two 16-bit values into one 32-bit word so that a ref-index pair can
 * be stored into the caches with a single 32-bit write.
 * Byte order is chosen per host endianness so that in-memory layout matches
 * two consecutive 16-bit stores.
 * Fixes: the original shifted a possibly negative int left (undefined
 * behavior, C99 6.5.7); the shift is now done on an unsigned value, and the
 * closing brace lost in this dump is restored.
 * @param a first 16-bit value
 * @param b second 16-bit value
 * @return the packed 32-bit word
 */
static av_always_inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
    return (b&0xFFFF) + ((uint32_t)a<<16);
#else
    return (a&0xFFFF) + ((uint32_t)b<<16);
#endif
}
/**
 * rem6[qp] == qp % 6 for 0 <= qp < 52; table lookup avoids a division in
 * the dequant scale computation. (Restores the `};` terminator that was
 * dropped from this dump.)
 */
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};
/**
 * div6[qp] == qp / 6 for 0 <= qp < 52; companion of rem6, avoids a division
 * in the dequant scale computation. (Restores the `};` terminator that was
 * dropped from this dump.)
 */
static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
/**
 * Per-row block indices used when reading data from the left neighbor MB.
 * Row 0 is the normal (same field/frame structure) case; the other rows
 * cover the MBAFF field/frame mismatch cases selected in fill_caches().
 * (Restores the `};` terminator that was dropped from this dump.)
 */
static const uint8_t left_block_options[4][8]={
    {0,1,2,3,7,10,8,11},
    {2,2,3,3,8,11,8,11},
    {0,0,1,1,7,10,7,10},
    {0,2,0,2,7,10,7,10}
};
107 #define LEVEL_TAB_BITS 8
108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
/*
 * NOTE(review): this dump is corrupted — blank lines and lines containing
 * only closing braces were dropped, and each line still carries its original
 * line number as a text prefix. Code bytes are kept untouched below; only
 * comments were added. Restore the braces from a pristine h264.c.
 */
/**
 * Fills the per-macroblock neighbor caches (intra prediction modes,
 * non_zero_count, cbp, mv/ref/mvd caches, direct-mode flags) from the
 * neighboring macroblocks, including MBAFF field/frame neighbor selection.
 * @param mb_type the current macroblock's type flags
 * @param for_deblock nonzero when filling only what the loop filter needs
 */
110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
111 MpegEncContext * const s = &h->s;
112 const int mb_xy= h->mb_xy;
113 int topleft_xy, top_xy, topright_xy, left_xy[2];
114 int topleft_type, top_type, topright_type, left_type[2];
115 const uint8_t * left_block;
116 int topleft_partition= -1;
117 int i;
// MB above the current one; the stride is doubled for field pictures
119 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
121 //FIXME deblocking could skip the intra and nnz parts.
122 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
123 return;
125 /* Wow, what a mess, why didn't they simplify the interlacing & intra
126 * stuff, I can't imagine that these complex rules are worth it. */
// default (non-MBAFF) neighbor addresses
128 topleft_xy = top_xy - 1;
129 topright_xy= top_xy + 1;
130 left_xy[1] = left_xy[0] = mb_xy-1;
131 left_block = left_block_options[0];
// MBAFF: re-derive neighbors from the MB-pair grid depending on the
// field/frame coding of the current MB and each neighbor pair
132 if(FRAME_MBAFF){
133 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
134 const int top_pair_xy = pair_xy - s->mb_stride;
135 const int topleft_pair_xy = top_pair_xy - 1;
136 const int topright_pair_xy = top_pair_xy + 1;
137 const int topleft_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
138 const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
139 const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
140 const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
141 const int curr_mb_field_flag = IS_INTERLACED(mb_type);
142 const int bottom = (s->mb_y & 1);
143 tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
145 if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
146 top_xy -= s->mb_stride;
148 if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
149 topleft_xy -= s->mb_stride;
150 } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
151 topleft_xy += s->mb_stride;
152 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
153 topleft_partition = 0;
155 if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
156 topright_xy -= s->mb_stride;
// left neighbor with mismatched field/frame coding: pick the appropriate
// left_block_options row for remapped sub-block indices
158 if (left_mb_field_flag != curr_mb_field_flag) {
159 left_xy[1] = left_xy[0] = pair_xy - 1;
160 if (curr_mb_field_flag) {
161 left_xy[1] += s->mb_stride;
162 left_block = left_block_options[3];
163 } else {
164 left_block= left_block_options[2 - bottom];
169 h->top_mb_xy = top_xy;
170 h->left_mb_xy[0] = left_xy[0];
171 h->left_mb_xy[1] = left_xy[1];
// deblocking only needs top/left types; presumably slice_table entries
// >= 0xFFFF mark unavailable MBs here — TODO confirm sentinel meaning
172 if(for_deblock){
173 topleft_type = 0;
174 topright_type = 0;
175 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
176 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
177 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
179 if(MB_MBAFF && !IS_INTRA(mb_type)){
180 int list;
181 for(list=0; list<h->list_count; list++){
182 //These values where changed for ease of performing MC, we need to change them back
183 //FIXME maybe we can make MC and loop filter use the same values or prevent
184 //the MC code from changing ref_cache and rather use a temporary array.
185 if(USES_LIST(mb_type,list)){
186 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
187 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
188 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
189 ref += h->b8_stride;
190 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
191 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
// normal (decode) path: neighbor types are valid only within the same slice
195 }else{
196 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
197 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
198 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
199 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
200 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
// intra: compute bitmasks of which neighboring sample sets are available;
// with constrained_intra_pred only intra neighbors count (type_mask)
202 if(IS_INTRA(mb_type)){
203 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
204 h->topleft_samples_available=
205 h->top_samples_available=
206 h->left_samples_available= 0xFFFF;
207 h->topright_samples_available= 0xEEEA;
209 if(!(top_type & type_mask)){
210 h->topleft_samples_available= 0xB3FF;
211 h->top_samples_available= 0x33FF;
212 h->topright_samples_available= 0x26EA;
214 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
215 if(IS_INTERLACED(mb_type)){
216 if(!(left_type[0] & type_mask)){
217 h->topleft_samples_available&= 0xDFFF;
218 h->left_samples_available&= 0x5FFF;
220 if(!(left_type[1] & type_mask)){
221 h->topleft_samples_available&= 0xFF5F;
222 h->left_samples_available&= 0xFF5F;
224 }else{
225 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
226 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
227 assert(left_xy[0] == left_xy[1]);
228 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
229 h->topleft_samples_available&= 0xDF5F;
230 h->left_samples_available&= 0x5F5F;
233 }else{
234 if(!(left_type[0] & type_mask)){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
240 if(!(topleft_type & type_mask))
241 h->topleft_samples_available&= 0x7FFF;
243 if(!(topright_type & type_mask))
244 h->topright_samples_available&= 0xFBFF;
// cache the intra4x4 prediction modes of the top and left neighbors;
// -1 = unavailable, 2 (DC) when the neighbor is not intra4x4
246 if(IS_INTRA4x4(mb_type)){
247 if(IS_INTRA4x4(top_type)){
248 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
249 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
250 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
251 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
252 }else{
253 int pred;
254 if(!(top_type & type_mask))
255 pred= -1;
256 else{
257 pred= 2;
259 h->intra4x4_pred_mode_cache[4+8*0]=
260 h->intra4x4_pred_mode_cache[5+8*0]=
261 h->intra4x4_pred_mode_cache[6+8*0]=
262 h->intra4x4_pred_mode_cache[7+8*0]= pred;
264 for(i=0; i<2; i++){
265 if(IS_INTRA4x4(left_type[i])){
266 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
267 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
268 }else{
269 int pred;
270 if(!(left_type[i] & type_mask))
271 pred= -1;
272 else{
273 pred= 2;
275 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
276 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
// The lines below are the remnant of a block-layout diagram; its /* */
// delimiters were presumably lost when this dump dropped lines — verify
// against a pristine h264.c.
285 0 . T T. T T T T
286 1 L . .L . . . .
287 2 L . .L . . . .
288 3 . T TL . . . .
289 4 L . .L . . . .
290 5 L . .. . . . .
// non_zero_count cache from top/left neighbors; 64 marks "unavailable"
// in the CAVLC case, 0 for CABAC inter (see the ternaries below)
292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
293 if(top_type){
294 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
295 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
296 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
297 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
299 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
300 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
302 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
303 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
305 }else{
306 h->non_zero_count_cache[4+8*0]=
307 h->non_zero_count_cache[5+8*0]=
308 h->non_zero_count_cache[6+8*0]=
309 h->non_zero_count_cache[7+8*0]=
311 h->non_zero_count_cache[1+8*0]=
312 h->non_zero_count_cache[2+8*0]=
314 h->non_zero_count_cache[1+8*3]=
315 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
319 for (i=0; i<2; i++) {
320 if(left_type[i]){
321 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
322 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
323 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
324 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
325 }else{
326 h->non_zero_count_cache[3+8*1 + 2*8*i]=
327 h->non_zero_count_cache[3+8*2 + 2*8*i]=
328 h->non_zero_count_cache[0+8*1 + 8*i]=
329 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
// CABAC: cache neighbor coded-block-pattern bits for context modelling
333 if( h->pps.cabac ) {
334 // top_cbp
335 if(top_type) {
336 h->top_cbp = h->cbp_table[top_xy];
337 } else if(IS_INTRA(mb_type)) {
338 h->top_cbp = 0x1C0;
339 } else {
340 h->top_cbp = 0;
342 // left_cbp
343 if (left_type[0]) {
344 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
345 } else if(IS_INTRA(mb_type)) {
346 h->left_cbp = 0x1C0;
347 } else {
348 h->left_cbp = 0;
350 if (left_type[0]) {
351 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
353 if (left_type[1]) {
354 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
// inter: fill motion-vector and reference caches from the top, left,
// topleft and topright neighbors for each active reference list
358 #if 1
359 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
360 int list;
361 for(list=0; list<h->list_count; list++){
362 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
363 /*if(!h->mv_cache_clean[list]){
364 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
365 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
366 h->mv_cache_clean[list]= 1;
368 continue;
370 h->mv_cache_clean[list]= 0;
372 if(USES_LIST(top_type, list)){
373 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
374 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
375 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
376 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
377 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
378 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
379 h->ref_cache[list][scan8[0] + 0 - 1*8]=
380 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
381 h->ref_cache[list][scan8[0] + 2 - 1*8]=
382 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
383 }else{
384 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
385 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
386 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
387 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
388 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
391 for(i=0; i<2; i++){
392 int cache_idx = scan8[0] - 1 + i*2*8;
393 if(USES_LIST(left_type[i], list)){
394 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
395 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
396 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
397 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
398 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
399 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
400 }else{
401 *(uint32_t*)h->mv_cache [list][cache_idx ]=
402 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
403 h->ref_cache[list][cache_idx ]=
404 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
408 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
409 continue;
411 if(USES_LIST(topleft_type, list)){
412 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
413 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
414 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
415 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
416 }else{
417 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
418 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
421 if(USES_LIST(topright_type, list)){
422 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
423 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
424 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
425 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
426 }else{
427 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
428 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
431 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
432 continue;
434 h->ref_cache[list][scan8[5 ]+1] =
435 h->ref_cache[list][scan8[7 ]+1] =
436 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
437 h->ref_cache[list][scan8[4 ]] =
438 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
439 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
440 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
441 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
442 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
443 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
// CABAC: also cache the neighbors' motion vector differences
445 if( h->pps.cabac ) {
446 /* XXX beurk, Load mvd */
447 if(USES_LIST(top_type, list)){
448 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
449 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
450 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
451 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
452 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
453 }else{
454 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
455 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
456 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
457 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
459 if(USES_LIST(left_type[0], list)){
460 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
461 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
462 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
463 }else{
464 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
465 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
467 if(USES_LIST(left_type[1], list)){
468 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
469 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
470 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
471 }else{
472 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
473 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
475 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
476 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
477 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
478 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
479 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
// B slices: cache the neighbors' direct-mode flags for CABAC contexts
481 if(h->slice_type_nos == FF_B_TYPE){
482 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
484 if(IS_DIRECT(top_type)){
485 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
486 }else if(IS_8X8(top_type)){
487 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
488 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
489 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
490 }else{
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
494 if(IS_DIRECT(left_type[0]))
495 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
496 else if(IS_8X8(left_type[0]))
497 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
498 else
499 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
501 if(IS_DIRECT(left_type[1]))
502 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
503 else if(IS_8X8(left_type[1]))
504 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
505 else
506 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
// MBAFF: rescale cached refs/mvs that cross a field/frame boundary
510 if(FRAME_MBAFF){
511 #define MAP_MVS\
512 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
513 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
514 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
515 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
516 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
517 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
518 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
519 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
520 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
521 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
522 if(MB_FIELD){
523 #define MAP_F2F(idx, mb_type)\
524 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
525 h->ref_cache[list][idx] <<= 1;\
526 h->mv_cache[list][idx][1] /= 2;\
527 h->mvd_cache[list][idx][1] /= 2;\
529 MAP_MVS
530 #undef MAP_F2F
531 }else{
532 #define MAP_F2F(idx, mb_type)\
533 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
534 h->ref_cache[list][idx] >>= 1;\
535 h->mv_cache[list][idx][1] <<= 1;\
536 h->mvd_cache[list][idx][1] <<= 1;\
538 MAP_MVS
539 #undef MAP_F2F
544 #endif
// count of top/left neighbors coded with the 8x8 DCT (transform-size context)
546 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
549 static inline void write_back_intra_pred_mode(H264Context *h){
550 const int mb_xy= h->mb_xy;
552 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
553 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
554 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
555 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
556 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
557 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
558 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
562 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
564 static inline int check_intra4x4_pred_mode(H264Context *h){
565 MpegEncContext * const s = &h->s;
566 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
567 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
568 int i;
570 if(!(h->top_samples_available&0x8000)){
571 for(i=0; i<4; i++){
572 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
573 if(status<0){
574 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
575 return -1;
576 } else if(status){
577 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
582 if((h->left_samples_available&0x8888)!=0x8888){
583 static const int mask[4]={0x8000,0x2000,0x80,0x20};
584 for(i=0; i<4; i++){
585 if(!(h->left_samples_available&mask[i])){
586 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
587 if(status<0){
588 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
589 return -1;
590 } else if(status){
591 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
597 return 0;
598 } //FIXME cleanup like next
601 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
603 static inline int check_intra_pred_mode(H264Context *h, int mode){
604 MpegEncContext * const s = &h->s;
605 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
606 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
608 if(mode > 6U) {
609 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
610 return -1;
613 if(!(h->top_samples_available&0x8000)){
614 mode= top[ mode ];
615 if(mode<0){
616 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
617 return -1;
621 if((h->left_samples_available&0x8080) != 0x8080){
622 mode= left[ mode ];
623 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
624 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
626 if(mode<0){
627 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
628 return -1;
632 return mode;
636 * gets the predicted intra4x4 prediction mode.
638 static inline int pred_intra_mode(H264Context *h, int n){
639 const int index8= scan8[n];
640 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
641 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
642 const int min= FFMIN(left, top);
644 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
646 if(min<0) return DC_PRED;
647 else return min;
650 static inline void write_back_non_zero_count(H264Context *h){
651 const int mb_xy= h->mb_xy;
653 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
654 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
655 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
656 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
657 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
658 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
659 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
661 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
662 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
663 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
665 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
666 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
667 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
671 * gets the predicted number of non-zero coefficients.
672 * @param n block index
674 static inline int pred_non_zero_count(H264Context *h, int n){
675 const int index8= scan8[n];
676 const int left= h->non_zero_count_cache[index8 - 1];
677 const int top = h->non_zero_count_cache[index8 - 8];
678 int i= left + top;
680 if(i<64) i= (i+1)>>1;
682 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
684 return i&31;
/*
 * NOTE(review): standalone closing-brace lines were dropped from this dump
 * and each line carries its original number. Code bytes kept untouched;
 * only comments added. Restore braces against a pristine h264.c.
 */
/**
 * Fetches the MV/ref used as the diagonal neighbor "C" for MV prediction:
 * points *C at the chosen MV and returns its reference index (or a
 * LIST_NOT_USED/PART_NOT_AVAILABLE sentinel). The MBAFF paths rescale the
 * vertical component when crossing a field/frame boundary.
 */
687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
688 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
689 MpegEncContext *s = &h->s;
691 /* there is no consistent mapping of mvs to neighboring locations that will
692 * make mbaff happy, so we can't move all this logic to fill_caches */
693 if(FRAME_MBAFF){
694 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
695 const int16_t *mv;
// default *C: a zeroed scratch cache slot, overwritten by SET_DIAG_MV below
696 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
697 *C = h->mv_cache[list][scan8[0]-2];
699 if(!MB_FIELD
700 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
701 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
702 if(IS_INTERLACED(mb_types[topright_xy])){
// SET_DIAG_MV(MV_OP,REF_OP,X4,Y4): read the MV at (x4,y4) in 4x4-block
// units, scale its vertical part (MV_OP) and ref index (REF_OP) across
// the field/frame boundary, and return from the enclosing function.
703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
704 const int x4 = X4, y4 = Y4;\
705 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
706 if(!USES_LIST(mb_type,list))\
707 return LIST_NOT_USED;\
708 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
709 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
710 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
711 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
713 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
// top-right unavailable: fall back on the left neighbor column
716 if(topright_ref == PART_NOT_AVAILABLE
717 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
718 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
719 if(!MB_FIELD
720 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
721 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
723 if(MB_FIELD
724 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
725 && i >= scan8[0]+8){
726 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
727 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
730 #undef SET_DIAG_MV
// common (non-MBAFF-special) path: use top-right if available, else top-left
733 if(topright_ref != PART_NOT_AVAILABLE){
734 *C= h->mv_cache[list][ i - 8 + part_width ];
735 return topright_ref;
736 }else{
737 tprintf(s->avctx, "topright MV not available\n");
739 *C= h->mv_cache[list][ i - 8 - 1 ];
740 return h->ref_cache[list][ i - 8 - 1 ];
745 * gets the predicted MV.
746 * @param n the block index
747 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
748 * @param mx the x component of the predicted motion vector
749 * @param my the y component of the predicted motion vector
/*
 * NOTE(review): standalone closing-brace lines were dropped from this dump
 * and each line carries its original number. Code bytes kept untouched;
 * only comments added. Restore braces against a pristine h264.c.
 */
/**
 * Gets the predicted MV for block n: the component-wise median of the
 * left (A), top (B) and diagonal (C) neighbor MVs, with the standard
 * special cases when exactly one neighbor shares the target ref, or when
 * only the left neighbor is available.
 * @param n the block index
 * @param part_width the width of the partition (4, 8, 16) -> (1, 2, 4)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
752 const int index8= scan8[n];
753 const int top_ref= h->ref_cache[list][ index8 - 8 ];
754 const int left_ref= h->ref_cache[list][ index8 - 1 ];
755 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
756 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
757 const int16_t * C;
758 int diagonal_ref, match_count;
760 assert(part_width==1 || part_width==2 || part_width==4);
// (the diagram below lost its comment delimiters when this dump dropped lines)
762 /* mv_cache
763 B . . A T T T T
764 U . . L . . , .
765 U . . L . . . .
766 U . . L . . , .
767 . . . L . . . .
770 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
771 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
772 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
773 if(match_count > 1){ //most common
774 *mx= mid_pred(A[0], B[0], C[0]);
775 *my= mid_pred(A[1], B[1], C[1]);
776 }else if(match_count==1){
777 if(left_ref==ref){
778 *mx= A[0];
779 *my= A[1];
780 }else if(top_ref==ref){
781 *mx= B[0];
782 *my= B[1];
783 }else{
784 *mx= C[0];
785 *my= C[1];
787 }else{
788 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
789 *mx= A[0];
790 *my= A[1];
791 }else{
792 *mx= mid_pred(A[0], B[0], C[0]);
793 *my= mid_pred(A[1], B[1], C[1]);
797 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
801 * gets the directionally predicted 16x8 MV.
802 * @param n the block index
803 * @param mx the x component of the predicted motion vector
804 * @param my the y component of the predicted motion vector
806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
807 if(n==0){
808 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
809 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
811 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
813 if(top_ref == ref){
814 *mx= B[0];
815 *my= B[1];
816 return;
818 }else{
819 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
820 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
822 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
824 if(left_ref == ref){
825 *mx= A[0];
826 *my= A[1];
827 return;
831 //RARE
832 pred_motion(h, n, 4, list, ref, mx, my);
836 * gets the directionally predicted 8x16 MV.
837 * @param n the block index
838 * @param mx the x component of the predicted motion vector
839 * @param my the y component of the predicted motion vector
841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
842 if(n==0){
843 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
844 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
846 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
848 if(left_ref == ref){
849 *mx= A[0];
850 *my= A[1];
851 return;
853 }else{
854 const int16_t * C;
855 int diagonal_ref;
857 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
859 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
861 if(diagonal_ref == ref){
862 *mx= C[0];
863 *my= C[1];
864 return;
868 //RARE
869 pred_motion(h, n, 2, list, ref, mx, my);
872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
873 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
874 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
876 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
878 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
879 || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
880 || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
882 *mx = *my = 0;
883 return;
886 pred_motion(h, 0, 4, 0, 0, mx, my);
888 return;
891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
892 int poc0 = h->ref_list[0][i].poc;
893 int td = av_clip(poc1 - poc0, -128, 127);
894 if(td == 0 || h->ref_list[0][i].long_ref){
895 return 256;
896 }else{
897 int tb = av_clip(poc - poc0, -128, 127);
898 int tx = (16384 + (FFABS(td) >> 1)) / td;
899 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
903 static inline void direct_dist_scale_factor(H264Context * const h){
904 MpegEncContext * const s = &h->s;
905 const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
906 const int poc1 = h->ref_list[1][0].poc;
907 int i, field;
908 for(field=0; field<2; field++){
909 const int poc = h->s.current_picture_ptr->field_poc[field];
910 const int poc1 = h->ref_list[1][0].field_poc[field];
911 for(i=0; i < 2*h->ref_count[0]; i++)
912 h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
915 for(i=0; i<h->ref_count[0]; i++){
916 h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
/* Builds the colocated-ref -> list0-index remap table used by temporal direct
 * mode: for each reference of the colocated (list1[0]) picture, find the entry
 * in our own reference list that denotes the same frame/field.
 * NOTE(review): extraction of this chunk dropped some closing-brace lines;
 * code tokens below are kept exactly as in the original. */
static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    int j, old_ref, rfield;
    /* with MBAFF the field references occupy slots 16..16+2*ref_count */
    int start= mbafi ? 16 : 0;
    int end = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
    int interl= mbafi || s->picture_structure != PICT_FRAME;

    /* bogus; fills in for missing frames */
    memset(map[list], 0, sizeof(map[list]));

    for(rfield=0; rfield<2; rfield++){
        for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
            /* ref_poc encodes 4*frame_num + reference-parity bits */
            int poc = ref1->ref_poc[colfield][list][old_ref];

            if (!interl)
                poc |= 3;
            else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
                poc= (poc&~3) + rfield + 1;

            /* search our list for a reference with the same encoded POC */
            for(j=start; j<end; j++){
                if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
                    int cur_ref= mbafi ? (j-16)^field : j;
                    map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
                    if(rfield == field)
                        map[list][old_ref] = cur_ref;
                    break;
/* Records this picture's reference lists (counts + encoded POCs) into the
 * current Picture, then builds the col->list0 maps needed by temporal direct
 * prediction for B slices.
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, j, field;
    /* sidx selects the field slot: 0 for top/frame, 1 for bottom */
    int sidx= (s->picture_structure&1)^1;
    int ref1sidx= (ref1->reference&1)^1;

    for(list=0; list<2; list++){
        cur->ref_count[sidx][list] = h->ref_count[list];
        for(j=0; j<h->ref_count[list]; j++)
            /* same 4*frame_num + parity encoding that fill_colmap() matches on */
            cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);

    /* a frame picture fills both field slots identically */
    if(s->picture_structure == PICT_FRAME){
        memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
        memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));

    cur->mbaff= FRAME_MBAFF;

    /* the colmaps are only needed for temporal direct B prediction */
    if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
        return;

    for(list=0; list<2; list++){
        fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
        for(field=0; field<2; field++)
            fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
/* Derives motion vectors and reference indices for B_Direct macroblocks /
 * sub-blocks, both spatial and temporal flavours (H.264 spec 8.4.1.2).
 * On entry *mb_type describes the current MB; on exit the direct partition
 * types are filled in and h->mv_cache / h->ref_cache hold the derived MVs.
 * NOTE(review): extraction of this chunk dropped a number of closing-brace
 * lines; the code tokens below are kept exactly as in the original. */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    int b8_stride = h->b8_stride;
    int b4_stride = h->b_stride;
    int mb_xy = h->mb_xy;
    int mb_type_col[2];                  /* colocated MB type(s): [0]=top/only, [1]=bottom */
    const int16_t (*l1mv0)[2], (*l1mv1)[2]; /* colocated list0/list1 MVs */
    const int8_t *l1ref0, *l1ref1;          /* colocated list0/list1 ref indices */
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

    assert(h->ref_list[1][0].reference&3);

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)

    /* Select the colocated macroblock and strides depending on the
     * frame/field coding of the current MB vs the colocated picture. */
    if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
        if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
            /* frame MB, field colocated: pick the temporally closer field */
            int cur_poc = s->current_picture_ptr->poc;
            int *col_poc = h->ref_list[1]->field_poc;
            int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
            mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
            b8_stride = 0;
        }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
            int fieldoff= 2*(h->ref_list[1][0].reference)-3;
            mb_xy += s->mb_stride*fieldoff;
        goto single_col;
    }else{ // AFL/AFR/FR/FL -> AFR/FR
        if(IS_INTERLACED(*mb_type)){ // AFL /FL -> AFR/FR
            /* field MB, frame colocated: one colocated MB per field half */
            mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
            mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
            b8_stride *= 3;
            b4_stride *= 6;
            //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
            if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
                && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
                && !is_b8x8){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
        }else{ // AFR/FR -> AFR/FR
single_col:
            mb_type_col[0] =
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
            if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
                /* FIXME save sub mb types from previous frames (or derive from MVs)
                 * so we know exactly what block size to use */
                sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;

    /* pointers into the colocated picture's motion data */
    l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
    l1mv1 = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
    if(!b8_stride){
        if(s->mb_y&1){
            l1ref0 += h->b8_stride;
            l1ref1 += h->b8_stride;
            l1mv0 += 2*b4_stride;
            l1mv1 += 2*b4_stride;

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == PART_NOT_AVAILABLE)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            /* unsigned compare makes negative (unavailable) refs sort last */
            ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
            if(ref[list] < 0)
                ref[list] = -1;

        if(ref[0] < 0 && ref[1] < 0){
            /* no valid neighbour refs: spec says use ref 0 and zero MVs */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;

        /* drop the unused prediction list from the MB/sub-MB types */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* frame/field mismatch with the colocated MB: per-8x8 handling */
            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* col_zero: colocated block is near-static -> force zero MV for ref 0 */
                if(!IS_INTRA(mb_type_col[y8])
                   && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            if(!IS_INTRA(mb_type_col[0])
               && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col[0]) && ( l1ref0[x8 + y8*b8_stride] == 0
                    || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
                        && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;
        int ref_offset= 0;

        if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
            /* field MB inside a MBAFF frame: use the per-field tables */
            map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
            map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
            dist_scale_factor =h->dist_scale_factor_field[s->mb_y&1];
        if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
            ref_offset += 16;

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* FIXME assumes direct_8x8_inference == 1 */
            int y_shift = 2*!IS_INTERLACED(*mb_type);

            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[y8])){
                    /* intra colocated block: zero refs and MVs */
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;

                /* map the colocated reference into our list0 */
                ref0 = l1ref0[x8 + y8*b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0 + ref_offset];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                scale = dist_scale_factor[ref0];
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
                    int my_col = (mv_col[1]<<y_shift)/2;
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * my_col + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
            return;

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col[0])){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
                                                : map_col_to_list0[1][l1ref1[0] + ref_offset];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                /* list0 MV is the scaled colocated MV; list1 MV is their difference */
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[0])){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;

                ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
/* Copies the per-MB motion caches (mv_cache/ref_cache/mvd_cache) back into the
 * frame-wide tables of the current picture, plus the CABAC direct flags.
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;     /* 4x4 granularity index */
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;    /* 8x8 granularity index */
    int list;

    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* two 4x4 MVs (8 bytes) per row, copied as 64-bit stores */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        if( h->pps.cabac ) {
            /* MV differences are only needed as CABAC context */
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
                for(y=0; y<4; y++){
                    *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                    *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];

            /* one reference index per 8x8 block */
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];

    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            /* record which 8x8 partitions were B_Direct (CABAC context for neighbours) */
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
/* Parses a NAL unit header and removes the 00 00 03 emulation-prevention
 * escapes (H.264 spec 7.3/7.4.1), returning the unescaped RBSP.
 * Returns src itself when no escape is present, otherwise an internal buffer.
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    int i, si, di;
    uint8_t *dst;
    int bufidx;

//    src[0]&0x80; //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    src++; length--;
#if 0
    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
#endif

/* fast scan for a zero byte, word-at-a-time where unaligned loads are cheap;
 * RS is how far to back up after an over-stepping match */
#if HAVE_FAST_UNALIGNED
# if HAVE_FAST_64BIT
#   define RS 7
    for(i=0; i+1<length; i+=9){
        if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
# else
#   define RS 3
    for(i=0; i+1<length; i+=5){
        if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
# endif
            continue;
        if(i>0 && !src[i]) i--;
        while(src[i]) i++;
#else
#   define RS 0
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--;
#endif
        /* at a 00 00 pair: 00 00 03 is an escape, 00 00 0x (x<3) a start code */
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            if(src[i+2]!=3){
                /* startcode, so we must be past the end */
                length=i;
            break;
        i-= RS;

    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header
        return src;

    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
    dst= h->rbsp_buffer[bufidx];

    if (dst == NULL){
        return NULL;

//printf("decoding esc\n");
    /* copy the clean prefix, then strip escapes byte-by-byte */
    memcpy(dst, src, i);
    si=di=i;
    while(si+2<length){
        //remove escapes (very rare 1:2^22)
        if(src[si+2]>3){
            dst[di++]= src[si++];
            dst[di++]= src[si++];
        }else if(src[si]==0 && src[si+1]==0){
            if(src[si+2]==3){ //escape
                dst[di++]= 0;
                dst[di++]= 0;
                si+=3;
                continue;
            }else //next start code
                goto nsc;

        dst[di++]= src[si++];
    while(si<length)
        dst[di++]= src[si++];
nsc:

    memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);

    *dst_length= di;
    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
    return dst;
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456 int v= *src;
1457 int r;
1459 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1461 for(r=1; r<9; r++){
1462 if(v&1) return r;
1463 v>>=1;
1465 return 0;
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * Operates in place on the DC positions of the 16 luma 4x4 blocks inside
 * the MB's coefficient array (hence the stride-based offset tables).
 * @param qp quantization parameter
 * @param qmul dequantization multiplier for this qp
 */
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
#define stride 16
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

//memset(block, 64, 2*256);
//return;
    /* vertical 4-point Hadamard pass into temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;

    /* horizontal pass + dequantization (rounded >>8) back into block[] */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
#if 0
/* disabled: forward luma DC Hadamard transform (encoder-side counterpart of
 * h264_luma_dc_dequant_idct_c); kept for reference only */
/**
 * DCT transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
#endif
1548 #undef xStride
1549 #undef stride
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552 const int stride= 16*2;
1553 const int xStride= 16;
1554 int a,b,c,d,e;
1556 a= block[stride*0 + xStride*0];
1557 b= block[stride*0 + xStride*1];
1558 c= block[stride*1 + xStride*0];
1559 d= block[stride*1 + xStride*1];
1561 e= a-b;
1562 a= a+b;
1563 b= c-d;
1564 c= c+d;
1566 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
#if 0
/* disabled: forward 2x2 chroma DC transform (encoder-side counterpart of
 * chroma_dc_dequant_idct_c); kept for reference only */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
#endif
/**
 * gets the chroma qp.
 * @param t      chroma plane index (selects the Cb/Cr QP table)
 * @param qscale luma quantization parameter
 * @return the mapped chroma QP from the PPS table
 */
static inline int get_chroma_qp(H264Context *h, int t, int qscale){
    return h->pps.chroma_qp_table[t][qscale];
/* Single-direction motion compensation for one partition: quarter-pel luma and
 * eighth-pel chroma interpolation from pic into dest_*, with edge emulation
 * when the MV points (partly) outside the picture.
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    /* MV in quarter-pel units, relative to the partition position */
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);    /* sub-pel phase selects the qpel function */
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* sub-pel interpolation reads extra taps; shrink the usable edge margin */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if( full_mx < 0-extra_width
        || full_my < 0-extra_height
        || full_mx + 16/*FIXME*/ > pic_width + extra_width
        || full_my + 16/*FIXME*/ > pic_height + extra_height){
        /* MV reaches outside the padded picture: interpolate from a padded copy */
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
        src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);

    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
        src_cb= s->edge_emu_buffer;
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
        src_cr= s->edge_emu_buffer;
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
/* Unweighted motion compensation of a partition: list0 prediction is stored
 * with the "put" functions, a following list1 prediction is blended in with
 * the "avg" functions (bi-prediction averaging).
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;
    qpel_mc_func *qpix_op= qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

    /* advance destinations to the partition; offsets are in chroma (8-pel) units */
    dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
    dest_cb += x_offset + y_offset*h->mb_uvlinesize;
    dest_cr += x_offset + y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);

        /* second direction averages into the first */
        qpix_op= qpix_avg;
        chroma_op= chroma_avg;

    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);
/* Weighted motion compensation of a partition: explicit (use_weight==1) or
 * implicit (use_weight==2) weighted prediction, uni- or bi-directional.
 * NOTE(review): extraction dropped some closing-brace lines; code tokens are
 * kept exactly as in the original. */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
    dest_cb += x_offset + y_offset*h->mb_uvlinesize;
    dest_cr += x_offset + y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        /* list0 goes to the destination, list1 to scratch; then blend weighted */
        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* implicit weights: complementary pair summing to 64, denom 5 */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            /* explicit bi-directional weights + offsets from the slice header */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
    }else{
        /* uni-directional: predict, then apply explicit weight in place */
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
/* Dispatches motion compensation of one partition to the weighted or the
 * standard path. Weighted prediction is used for explicit weights
 * (use_weight==1) and for implicit bi-prediction whose weight pair is not
 * the trivial 32/32 average. */
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
                           int list0, int list1){
    if((h->use_weight==2 && list0 && list1
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
       || h->use_weight==1)
        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                         x_offset, y_offset, qpix_put, chroma_put,
                         weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
    else
        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1779 static inline void prefetch_motion(H264Context *h, int list){
1780 /* fetch pixels for estimated mv 4 macroblocks ahead
1781 * optimized for 64byte cache lines */
1782 MpegEncContext * const s = &h->s;
1783 const int refn = h->ref_cache[list][scan8[0]];
1784 if(refn >= 0){
1785 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787 uint8_t **src= h->ref_list[list][refn].data;
1788 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799 MpegEncContext * const s = &h->s;
1800 const int mb_xy= h->mb_xy;
1801 const int mb_type= s->current_picture.mb_type[mb_xy];
1803 assert(IS_INTER(mb_type));
1805 prefetch_motion(h, 0);
1807 if(IS_16X16(mb_type)){
1808 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810 &weight_op[0], &weight_avg[0],
1811 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812 }else if(IS_16X8(mb_type)){
1813 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815 &weight_op[1], &weight_avg[1],
1816 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819 &weight_op[1], &weight_avg[1],
1820 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821 }else if(IS_8X16(mb_type)){
1822 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824 &weight_op[2], &weight_avg[2],
1825 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828 &weight_op[2], &weight_avg[2],
1829 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830 }else{
1831 int i;
1833 assert(IS_8X8(mb_type));
1835 for(i=0; i<4; i++){
1836 const int sub_mb_type= h->sub_mb_type[i];
1837 const int n= 4*i;
1838 int x_offset= (i&1)<<2;
1839 int y_offset= (i&2)<<1;
1841 if(IS_SUB_8X8(sub_mb_type)){
1842 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844 &weight_op[3], &weight_avg[3],
1845 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846 }else if(IS_SUB_8X4(sub_mb_type)){
1847 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849 &weight_op[4], &weight_avg[4],
1850 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853 &weight_op[4], &weight_avg[4],
1854 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855 }else if(IS_SUB_4X8(sub_mb_type)){
1856 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858 &weight_op[5], &weight_avg[5],
1859 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862 &weight_op[5], &weight_avg[5],
1863 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864 }else{
1865 int j;
1866 assert(IS_SUB_4X4(sub_mb_type));
1867 for(j=0; j<4; j++){
1868 int sub_x_offset= x_offset + 2*(j&1);
1869 int sub_y_offset= y_offset + (j&2);
1870 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872 &weight_op[6], &weight_avg[6],
1873 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1879 prefetch_motion(h, 1);
1882 static av_cold void init_cavlc_level_tab(void){
1883 int suffix_length, mask;
1884 unsigned int i;
1886 for(suffix_length=0; suffix_length<7; suffix_length++){
1887 for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888 int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889 int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1891 mask= -(level_code&1);
1892 level_code= (((2+level_code)>>1) ^ mask) - mask;
1893 if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894 cavlc_level_tab[suffix_length][i][0]= level_code;
1895 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896 }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899 }else{
1900 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1907 static av_cold void decode_init_vlc(void){
1908 static int done = 0;
1910 if (!done) {
1911 int i;
1912 int offset;
1913 done = 1;
1915 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918 &chroma_dc_coeff_token_len [0], 1, 1,
1919 &chroma_dc_coeff_token_bits[0], 1, 1,
1920 INIT_VLC_USE_NEW_STATIC);
1922 offset = 0;
1923 for(i=0; i<4; i++){
1924 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927 &coeff_token_len [i][0], 1, 1,
1928 &coeff_token_bits[i][0], 1, 1,
1929 INIT_VLC_USE_NEW_STATIC);
1930 offset += coeff_token_vlc_tables_size[i];
1933 * This is a one time safety check to make sure that
1934 * the packed static coeff_token_vlc table sizes
1935 * were initialized correctly.
1937 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1939 for(i=0; i<3; i++){
1940 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942 init_vlc(&chroma_dc_total_zeros_vlc[i],
1943 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944 &chroma_dc_total_zeros_len [i][0], 1, 1,
1945 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946 INIT_VLC_USE_NEW_STATIC);
1948 for(i=0; i<15; i++){
1949 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951 init_vlc(&total_zeros_vlc[i],
1952 TOTAL_ZEROS_VLC_BITS, 16,
1953 &total_zeros_len [i][0], 1, 1,
1954 &total_zeros_bits[i][0], 1, 1,
1955 INIT_VLC_USE_NEW_STATIC);
1958 for(i=0; i<6; i++){
1959 run_vlc[i].table = run_vlc_tables[i];
1960 run_vlc[i].table_allocated = run_vlc_tables_size;
1961 init_vlc(&run_vlc[i],
1962 RUN_VLC_BITS, 7,
1963 &run_len [i][0], 1, 1,
1964 &run_bits[i][0], 1, 1,
1965 INIT_VLC_USE_NEW_STATIC);
1967 run7_vlc.table = run7_vlc_table,
1968 run7_vlc.table_allocated = run7_vlc_table_size;
1969 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970 &run_len [6][0], 1, 1,
1971 &run_bits[6][0], 1, 1,
1972 INIT_VLC_USE_NEW_STATIC);
1974 init_cavlc_level_tab();
1978 static void free_tables(H264Context *h){
1979 int i;
1980 H264Context *hx;
1981 av_freep(&h->intra4x4_pred_mode);
1982 av_freep(&h->chroma_pred_mode_table);
1983 av_freep(&h->cbp_table);
1984 av_freep(&h->mvd_table[0]);
1985 av_freep(&h->mvd_table[1]);
1986 av_freep(&h->direct_table);
1987 av_freep(&h->non_zero_count);
1988 av_freep(&h->slice_table_base);
1989 h->slice_table= NULL;
1991 av_freep(&h->mb2b_xy);
1992 av_freep(&h->mb2b8_xy);
1994 for(i = 0; i < MAX_THREADS; i++) {
1995 hx = h->thread_context[i];
1996 if(!hx) continue;
1997 av_freep(&hx->top_borders[1]);
1998 av_freep(&hx->top_borders[0]);
1999 av_freep(&hx->s.obmc_scratchpad);
2003 static void init_dequant8_coeff_table(H264Context *h){
2004 int i,q,x;
2005 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2009 for(i=0; i<2; i++ ){
2010 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012 break;
2015 for(q=0; q<52; q++){
2016 int shift = div6[q];
2017 int idx = rem6[q];
2018 for(x=0; x<64; x++)
2019 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021 h->pps.scaling_matrix8[i][x]) << shift;
2026 static void init_dequant4_coeff_table(H264Context *h){
2027 int i,j,q,x;
2028 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029 for(i=0; i<6; i++ ){
2030 h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031 for(j=0; j<i; j++){
2032 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034 break;
2037 if(j<i)
2038 continue;
2040 for(q=0; q<52; q++){
2041 int shift = div6[q] + 2;
2042 int idx = rem6[q];
2043 for(x=0; x<16; x++)
2044 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046 h->pps.scaling_matrix4[i][x]) << shift;
2051 static void init_dequant_tables(H264Context *h){
2052 int i,x;
2053 init_dequant4_coeff_table(h);
2054 if(h->pps.transform_8x8_mode)
2055 init_dequant8_coeff_table(h);
2056 if(h->sps.transform_bypass){
2057 for(i=0; i<6; i++)
2058 for(x=0; x<16; x++)
2059 h->dequant4_coeff[i][0][x] = 1<<6;
2060 if(h->pps.transform_8x8_mode)
2061 for(i=0; i<2; i++)
2062 for(x=0; x<64; x++)
2063 h->dequant8_coeff[i][0][x] = 1<<6;
2069 * allocates tables.
2070 * needs width/height
2072 static int alloc_tables(H264Context *h){
2073 MpegEncContext * const s = &h->s;
2074 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075 int x,y;
2077 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2079 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2080 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2081 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2083 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2088 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
2089 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2091 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2092 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093 for(y=0; y<s->mb_height; y++){
2094 for(x=0; x<s->mb_width; x++){
2095 const int mb_xy= x + y*s->mb_stride;
2096 const int b_xy = 4*x + 4*y*h->b_stride;
2097 const int b8_xy= 2*x + 2*y*h->b8_stride;
2099 h->mb2b_xy [mb_xy]= b_xy;
2100 h->mb2b8_xy[mb_xy]= b8_xy;
2104 s->obmc_scratchpad = NULL;
2106 if(!h->dequant4_coeff[0])
2107 init_dequant_tables(h);
2109 return 0;
2110 fail:
2111 free_tables(h);
2112 return -1;
2116 * Mimic alloc_tables(), but for every context thread.
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2120 dst->non_zero_count = src->non_zero_count;
2121 dst->slice_table = src->slice_table;
2122 dst->cbp_table = src->cbp_table;
2123 dst->mb2b_xy = src->mb2b_xy;
2124 dst->mb2b8_xy = src->mb2b8_xy;
2125 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2126 dst->mvd_table[0] = src->mvd_table[0];
2127 dst->mvd_table[1] = src->mvd_table[1];
2128 dst->direct_table = src->direct_table;
2130 dst->s.obmc_scratchpad = NULL;
2131 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2135 * Init context
2136 * Allocate buffers which are not shared amongst multiple threads.
2138 static int context_init(H264Context *h){
2139 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2142 return 0;
2143 fail:
2144 return -1; // free_tables will clean up for us
2147 static av_cold void common_init(H264Context *h){
2148 MpegEncContext * const s = &h->s;
2150 s->width = s->avctx->width;
2151 s->height = s->avctx->height;
2152 s->codec_id= s->avctx->codec->id;
2154 ff_h264_pred_init(&h->hpc, s->codec_id);
2156 h->dequant_coeff_pps= -1;
2157 s->unrestricted_mv=1;
2158 s->decode=1; //FIXME
2160 dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2162 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2163 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2167 * Reset SEI values at the beginning of the frame.
2169 * @param h H.264 context.
2171 static void reset_sei(H264Context *h) {
2172 h->sei_recovery_frame_cnt = -1;
2173 h->sei_dpb_output_delay = 0;
2174 h->sei_cpb_removal_delay = -1;
2175 h->sei_buffering_period_present = 0;
2178 static av_cold int decode_init(AVCodecContext *avctx){
2179 H264Context *h= avctx->priv_data;
2180 MpegEncContext * const s = &h->s;
2182 MPV_decode_defaults(s);
2184 s->avctx = avctx;
2185 common_init(h);
2187 s->out_format = FMT_H264;
2188 s->workaround_bugs= avctx->workaround_bugs;
2190 // set defaults
2191 // s->decode_mb= ff_h263_decode_mb;
2192 s->quarter_sample = 1;
2193 if(!avctx->has_b_frames)
2194 s->low_delay= 1;
2196 if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2197 avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2198 else
2199 avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200 avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201 avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
2203 decode_init_vlc();
2205 if(avctx->extradata_size > 0 && avctx->extradata &&
2206 *(char *)avctx->extradata == 1){
2207 h->is_avc = 1;
2208 h->got_avcC = 0;
2209 } else {
2210 h->is_avc = 0;
2213 h->thread_context[0] = h;
2214 h->outputed_poc = INT_MIN;
2215 h->prev_poc_msb= 1<<16;
2216 reset_sei(h);
2217 if(avctx->codec_id == CODEC_ID_H264){
2218 if(avctx->ticks_per_frame == 1){
2219 s->avctx->time_base.den *=2;
2221 avctx->ticks_per_frame = 2;
2223 return 0;
2226 static int frame_start(H264Context *h){
2227 MpegEncContext * const s = &h->s;
2228 int i;
2230 if(MPV_frame_start(s, s->avctx) < 0)
2231 return -1;
2232 ff_er_frame_start(s);
2234 * MPV_frame_start uses pict_type to derive key_frame.
2235 * This is incorrect for H.264; IDR markings must be used.
2236 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2237 * See decode_nal_units().
2239 s->current_picture_ptr->key_frame= 0;
2241 assert(s->linesize && s->uvlinesize);
2243 for(i=0; i<16; i++){
2244 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2245 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2247 for(i=0; i<4; i++){
2248 h->block_offset[16+i]=
2249 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2250 h->block_offset[24+16+i]=
2251 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2254 /* can't be in alloc_tables because linesize isn't known there.
2255 * FIXME: redo bipred weight to not require extra buffer? */
2256 for(i = 0; i < s->avctx->thread_count; i++)
2257 if(!h->thread_context[i]->s.obmc_scratchpad)
2258 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2260 /* some macroblocks will be accessed before they're available */
2261 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2262 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2264 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2266 // We mark the current picture as non-reference after allocating it, so
2267 // that if we break out due to an error it can be released automatically
2268 // in the next MPV_frame_start().
2269 // SVQ3 as well as most other codecs have only last/next/current and thus
2270 // get released even with set reference, besides SVQ3 and others do not
2271 // mark frames as reference later "naturally".
2272 if(s->codec_id != CODEC_ID_SVQ3)
2273 s->current_picture_ptr->reference= 0;
2275 s->current_picture_ptr->field_poc[0]=
2276 s->current_picture_ptr->field_poc[1]= INT_MAX;
2277 assert(s->current_picture_ptr->long_ref==0);
2279 return 0;
2282 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2283 MpegEncContext * const s = &h->s;
2284 int i;
2285 int step = 1;
2286 int offset = 1;
2287 int uvoffset= 1;
2288 int top_idx = 1;
2289 int skiplast= 0;
2291 src_y -= linesize;
2292 src_cb -= uvlinesize;
2293 src_cr -= uvlinesize;
2295 if(!simple && FRAME_MBAFF){
2296 if(s->mb_y&1){
2297 offset = MB_MBAFF ? 1 : 17;
2298 uvoffset= MB_MBAFF ? 1 : 9;
2299 if(!MB_MBAFF){
2300 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2301 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2302 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2303 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2304 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2307 }else{
2308 if(!MB_MBAFF){
2309 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2310 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2311 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2312 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2314 skiplast= 1;
2316 offset =
2317 uvoffset=
2318 top_idx = MB_MBAFF ? 0 : 1;
2320 step= MB_MBAFF ? 2 : 1;
2323 // There are two lines saved, the line above the the top macroblock of a pair,
2324 // and the line above the bottom macroblock
2325 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2326 for(i=1; i<17 - skiplast; i++){
2327 h->left_border[offset+i*step]= src_y[15+i* linesize];
2330 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2331 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2333 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2334 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2335 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2336 for(i=1; i<9 - skiplast; i++){
2337 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2338 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2340 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2341 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2345 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2346 MpegEncContext * const s = &h->s;
2347 int temp8, i;
2348 uint64_t temp64;
2349 int deblock_left;
2350 int deblock_top;
2351 int mb_xy;
2352 int step = 1;
2353 int offset = 1;
2354 int uvoffset= 1;
2355 int top_idx = 1;
2357 if(!simple && FRAME_MBAFF){
2358 if(s->mb_y&1){
2359 offset = MB_MBAFF ? 1 : 17;
2360 uvoffset= MB_MBAFF ? 1 : 9;
2361 }else{
2362 offset =
2363 uvoffset=
2364 top_idx = MB_MBAFF ? 0 : 1;
2366 step= MB_MBAFF ? 2 : 1;
2369 if(h->deblocking_filter == 2) {
2370 mb_xy = h->mb_xy;
2371 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2372 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2373 } else {
2374 deblock_left = (s->mb_x > 0);
2375 deblock_top = (s->mb_y > !!MB_FIELD);
2378 src_y -= linesize + 1;
2379 src_cb -= uvlinesize + 1;
2380 src_cr -= uvlinesize + 1;
2382 #define XCHG(a,b,t,xchg)\
2383 t= a;\
2384 if(xchg)\
2385 a= b;\
2386 b= t;
2388 if(deblock_left){
2389 for(i = !deblock_top; i<16; i++){
2390 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2392 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
2395 if(deblock_top){
2396 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2397 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2398 if(s->mb_x+1 < s->mb_width){
2399 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2403 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2404 if(deblock_left){
2405 for(i = !deblock_top; i<8; i++){
2406 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2407 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2409 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2410 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2412 if(deblock_top){
2413 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2414 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2419 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2420 MpegEncContext * const s = &h->s;
2421 const int mb_x= s->mb_x;
2422 const int mb_y= s->mb_y;
2423 const int mb_xy= h->mb_xy;
2424 const int mb_type= s->current_picture.mb_type[mb_xy];
2425 uint8_t *dest_y, *dest_cb, *dest_cr;
2426 int linesize, uvlinesize /*dct_offset*/;
2427 int i;
2428 int *block_offset = &h->block_offset[0];
2429 const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2430 /* is_h264 should always be true if SVQ3 is disabled. */
2431 const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2432 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2433 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2435 dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
2436 dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2437 dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2439 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2440 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2442 if (!simple && MB_FIELD) {
2443 linesize = h->mb_linesize = s->linesize * 2;
2444 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2445 block_offset = &h->block_offset[24];
2446 if(mb_y&1){ //FIXME move out of this function?
2447 dest_y -= s->linesize*15;
2448 dest_cb-= s->uvlinesize*7;
2449 dest_cr-= s->uvlinesize*7;
2451 if(FRAME_MBAFF) {
2452 int list;
2453 for(list=0; list<h->list_count; list++){
2454 if(!USES_LIST(mb_type, list))
2455 continue;
2456 if(IS_16X16(mb_type)){
2457 int8_t *ref = &h->ref_cache[list][scan8[0]];
2458 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2459 }else{
2460 for(i=0; i<16; i+=4){
2461 int ref = h->ref_cache[list][scan8[i]];
2462 if(ref >= 0)
2463 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2468 } else {
2469 linesize = h->mb_linesize = s->linesize;
2470 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2471 // dct_offset = s->linesize * 16;
2474 if (!simple && IS_INTRA_PCM(mb_type)) {
2475 for (i=0; i<16; i++) {
2476 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2478 for (i=0; i<8; i++) {
2479 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2480 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2482 } else {
2483 if(IS_INTRA(mb_type)){
2484 if(h->deblocking_filter)
2485 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2487 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2488 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2489 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2492 if(IS_INTRA4x4(mb_type)){
2493 if(simple || !s->encoding){
2494 if(IS_8x8DCT(mb_type)){
2495 if(transform_bypass){
2496 idct_dc_add =
2497 idct_add = s->dsp.add_pixels8;
2498 }else{
2499 idct_dc_add = s->dsp.h264_idct8_dc_add;
2500 idct_add = s->dsp.h264_idct8_add;
2502 for(i=0; i<16; i+=4){
2503 uint8_t * const ptr= dest_y + block_offset[i];
2504 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2505 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2506 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2507 }else{
2508 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2509 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2510 (h->topright_samples_available<<i)&0x4000, linesize);
2511 if(nnz){
2512 if(nnz == 1 && h->mb[i*16])
2513 idct_dc_add(ptr, h->mb + i*16, linesize);
2514 else
2515 idct_add (ptr, h->mb + i*16, linesize);
2519 }else{
2520 if(transform_bypass){
2521 idct_dc_add =
2522 idct_add = s->dsp.add_pixels4;
2523 }else{
2524 idct_dc_add = s->dsp.h264_idct_dc_add;
2525 idct_add = s->dsp.h264_idct_add;
2527 for(i=0; i<16; i++){
2528 uint8_t * const ptr= dest_y + block_offset[i];
2529 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2531 if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2532 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2533 }else{
2534 uint8_t *topright;
2535 int nnz, tr;
2536 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2537 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2538 assert(mb_y || linesize <= block_offset[i]);
2539 if(!topright_avail){
2540 tr= ptr[3 - linesize]*0x01010101;
2541 topright= (uint8_t*) &tr;
2542 }else
2543 topright= ptr + 4 - linesize;
2544 }else
2545 topright= NULL;
2547 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2548 nnz = h->non_zero_count_cache[ scan8[i] ];
2549 if(nnz){
2550 if(is_h264){
2551 if(nnz == 1 && h->mb[i*16])
2552 idct_dc_add(ptr, h->mb + i*16, linesize);
2553 else
2554 idct_add (ptr, h->mb + i*16, linesize);
2555 }else
2556 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2562 }else{
2563 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2564 if(is_h264){
2565 if(!transform_bypass)
2566 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2567 }else
2568 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2570 if(h->deblocking_filter)
2571 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2572 }else if(is_h264){
2573 hl_motion(h, dest_y, dest_cb, dest_cr,
2574 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2575 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2576 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2580 if(!IS_INTRA4x4(mb_type)){
2581 if(is_h264){
2582 if(IS_INTRA16x16(mb_type)){
2583 if(transform_bypass){
2584 if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2585 h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2586 }else{
2587 for(i=0; i<16; i++){
2588 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2589 s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2592 }else{
2593 s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2595 }else if(h->cbp&15){
2596 if(transform_bypass){
2597 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2598 idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2599 for(i=0; i<16; i+=di){
2600 if(h->non_zero_count_cache[ scan8[i] ]){
2601 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2604 }else{
2605 if(IS_8x8DCT(mb_type)){
2606 s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2607 }else{
2608 s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2612 }else{
2613 for(i=0; i<16; i++){
2614 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2615 uint8_t * const ptr= dest_y + block_offset[i];
2616 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2622 if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2623 uint8_t *dest[2] = {dest_cb, dest_cr};
2624 if(transform_bypass){
2625 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2626 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2627 h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2628 }else{
2629 idct_add = s->dsp.add_pixels4;
2630 for(i=16; i<16+8; i++){
2631 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2632 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2635 }else{
2636 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2637 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2638 if(is_h264){
2639 idct_add = s->dsp.h264_idct_add;
2640 idct_dc_add = s->dsp.h264_idct_dc_add;
2641 for(i=16; i<16+8; i++){
2642 if(h->non_zero_count_cache[ scan8[i] ])
2643 idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2644 else if(h->mb[i*16])
2645 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2647 }else{
2648 for(i=16; i<16+8; i++){
2649 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2650 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2651 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2658 if(h->cbp || IS_INTRA(mb_type))
2659 s->dsp.clear_blocks(h->mb);
2661 if(h->deblocking_filter) {
2662 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2663 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2664 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2665 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2666 if (!simple && FRAME_MBAFF) {
2667 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2668 } else {
2669 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2675 * Process a macroblock; this case avoids checks for expensive uncommon cases.
2677 static void hl_decode_mb_simple(H264Context *h){
2678 hl_decode_mb_internal(h, 1);
2682 * Process a macroblock; this handles edge cases, such as interlacing.
2684 static void av_noinline hl_decode_mb_complex(H264Context *h){
2685 hl_decode_mb_internal(h, 0);
/* Dispatch one macroblock to the simple or complex decode path.
 * The complex path is forced for small builds (CONFIG_SMALL), for
 * streams flagged complex, for intra-PCM macroblocks, and when
 * qscale==0 (lossless). */
2688 static void hl_decode_mb(H264Context *h){
2689 MpegEncContext * const s = &h->s;
2690 const int mb_xy= h->mb_xy;
2691 const int mb_type= s->current_picture.mb_type[mb_xy];
2692 int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2694 if (is_complex)
2695 hl_decode_mb_complex(h);
2696 else hl_decode_mb_simple(h);
2699 static void pic_as_field(Picture *pic, const int parity){
2700 int i;
2701 for (i = 0; i < 4; ++i) {
2702 if (parity == PICT_BOTTOM_FIELD)
2703 pic->data[i] += pic->linesize[i];
2704 pic->reference = parity;
2705 pic->linesize[i] *= 2;
2707 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
/* Copy *src into *dest if src is referenced with the requested parity.
 * For field parities the copy is converted to a field picture and its
 * pic_id is doubled with id_add (0 or 1) appended, matching the
 * field pic_num numbering.
 * @return 1 if a copy was made, 0 otherwise */
2710 static int split_field_copy(Picture *dest, Picture *src,
2711 int parity, int id_add){
2712 int match = !!(src->reference & parity);
2714 if (match) {
2715 *dest = *src;
2716 if(parity != PICT_FRAME){
2717 pic_as_field(dest, parity);
2718 dest->pic_id *= 2;
2719 dest->pic_id += id_add;
2723 return match;
/* Append references from in[] to def[], alternating between the two
 * field parities starting with sel. i[0]/i[1] scan in[] independently
 * for entries referenced with parity sel resp. sel^3 (the opposite
 * parity, or both halves of PICT_FRAME). pic_id is the list index for
 * long term refs, frame_num for short term ones.
 * @return number of entries written to def */
2726 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2727 int i[2]={0};
2728 int index=0;
2730 while(i[0]<len || i[1]<len){
2731 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2732 i[0]++;
2733 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2734 i[1]++;
2735 if(i[0] < len){
2736 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2737 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2739 if(i[1] < len){
2740 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2741 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2745 return index;
/* Selection-sort pictures from src[] into sorted[] by POC, taking only
 * POCs strictly beyond limit. dir==0 selects decreasing POCs below
 * limit, dir==1 increasing POCs above limit (the XOR tricks flip both
 * comparisons). Each pass picks the extremal remaining POC; limit is
 * advanced past it so every picture is emitted once.
 * @return number of entries written to sorted */
2748 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2749 int i, best_poc;
2750 int out_i= 0;
2752 for(;;){
2753 best_poc= dir ? INT_MIN : INT_MAX;
2755 for(i=0; i<len; i++){
2756 const int poc= src[i]->poc;
2757 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2758 best_poc= poc;
2759 sorted[out_i]= src[i];
2762 if(best_poc == (dir ? INT_MIN : INT_MAX))
2763 break;
2764 limit= sorted[out_i++]->poc - dir;
2766 return out_i;
2770 * fills the default_ref_list.
2772 static int fill_default_ref_list(H264Context *h){
2773 MpegEncContext * const s = &h->s;
2774 int i, len;
2776 if(h->slice_type_nos==FF_B_TYPE){
2777 Picture *sorted[32];
2778 int cur_poc, list;
2779 int lens[2];
2781 if(FIELD_PICTURE)
2782 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2783 else
2784 cur_poc= s->current_picture_ptr->poc;
2786 for(list= 0; list<2; list++){
2787 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2788 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2789 assert(len<=32);
2790 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2791 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2792 assert(len<=32);
2794 if(len < h->ref_count[list])
2795 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2796 lens[list]= len;
2799 if(lens[0] == lens[1] && lens[1] > 1){
2800 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2801 if(i == lens[0])
2802 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2804 }else{
2805 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2806 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2807 assert(len <= 32);
2808 if(len < h->ref_count[0])
2809 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2811 #ifdef TRACE
2812 for (i=0; i<h->ref_count[0]; i++) {
2813 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2815 if(h->slice_type_nos==FF_B_TYPE){
2816 for (i=0; i<h->ref_count[1]; i++) {
2817 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2820 #endif
2821 return 0;
2824 static void print_short_term(H264Context *h);
2825 static void print_long_term(H264Context *h);
2828 * Extract structure information about the picture described by pic_num in
2829 * the current decoding context (frame or field). Note that pic_num is
2830 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2831 * @param pic_num picture number for which to extract structure information
2832 * @param structure one of PICT_XXX describing structure of picture
2833 * with pic_num
2834 * @return frame number (short term) or long term index of picture
2835 * described by pic_num
/* See the description above: splits a pic_num into the frame number /
 * long term index and the picture structure. In field pictures the LSB
 * of pic_num selects same (odd) vs. opposite (even) field parity. */
2837 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2838 MpegEncContext * const s = &h->s;
2840 *structure = s->picture_structure;
2841 if(FIELD_PICTURE){
2842 if (!(pic_num & 1))
2843 /* opposite field */
2844 *structure ^= PICT_FRAME;
2845 pic_num >>= 1;
2848 return pic_num;
/* Parse the ref_pic_list_reordering() syntax and apply it on top of the
 * default reference lists. For each list with the reordering flag set,
 * reordering_of_pic_nums_idc values 0/1 move a short term ref (by
 * signed pic_num delta), 2 moves a long term ref (by index), 3 ends the
 * loop. The chosen ref is inserted at the current index and the old
 * entries are shifted down. Afterwards any hole in either list is
 * patched with default_ref_list[list][0].
 * @return 0 on success, -1 on bitstream errors */
2851 static int decode_ref_pic_list_reordering(H264Context *h){
2852 MpegEncContext * const s = &h->s;
2853 int list, index, pic_structure;
2855 print_short_term(h);
2856 print_long_term(h);
2858 for(list=0; list<h->list_count; list++){
2859 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2861 if(get_bits1(&s->gb)){
2862 int pred= h->curr_pic_num;
2864 for(index=0; ; index++){
2865 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2866 unsigned int pic_id;
2867 int i;
2868 Picture *ref = NULL;
2870 if(reordering_of_pic_nums_idc==3)
2871 break;
2873 if(index >= h->ref_count[list]){
2874 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2875 return -1;
2878 if(reordering_of_pic_nums_idc<3){
2879 if(reordering_of_pic_nums_idc<2){
/* short term: pic_num is predicted; idc 0 subtracts, idc 1 adds the
 * abs_diff, with wraparound modulo max_pic_num */
2880 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2881 int frame_num;
2883 if(abs_diff_pic_num > h->max_pic_num){
2884 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2885 return -1;
2888 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2889 else pred+= abs_diff_pic_num;
2890 pred &= h->max_pic_num - 1;
2892 frame_num = pic_num_extract(h, pred, &pic_structure);
2894 for(i= h->short_ref_count-1; i>=0; i--){
2895 ref = h->short_ref[i];
2896 assert(ref->reference);
2897 assert(!ref->long_ref);
2899 ref->frame_num == frame_num &&
2900 (ref->reference & pic_structure)
2902 break;
2904 if(i>=0)
2905 ref->pic_id= pred;
2906 }else{
/* idc 2: long term reference selected by long_term_pic_idx */
2907 int long_idx;
2908 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2910 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2912 if(long_idx>31){
2913 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2914 return -1;
2916 ref = h->long_ref[long_idx];
2917 assert(!(ref && !ref->reference));
2918 if(ref && (ref->reference & pic_structure)){
2919 ref->pic_id= pic_id;
2920 assert(ref->long_ref);
2921 i=0;
2922 }else{
2923 i=-1;
2927 if (i < 0) {
2928 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2929 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2930 } else {
/* insert ref at index; shift the entries in between down by one */
2931 for(i=index; i+1<h->ref_count[list]; i++){
2932 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2933 break;
2935 for(; i > index; i--){
2936 h->ref_list[list][i]= h->ref_list[list][i-1];
2938 h->ref_list[list][index]= *ref;
2939 if (FIELD_PICTURE){
2940 pic_as_field(&h->ref_list[list][index], pic_structure);
2943 }else{
2944 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2945 return -1;
/* fill any remaining holes with the first default ref so later code
 * never sees a NULL data[0] entry */
2950 for(list=0; list<h->list_count; list++){
2951 for(index= 0; index < h->ref_count[list]; index++){
2952 if(!h->ref_list[list][index].data[0]){
2953 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2954 if(h->default_ref_list[list][0].data[0])
2955 h->ref_list[list][index]= h->default_ref_list[list][0];
2956 else
2957 return -1;
2962 return 0;
/* For MBAFF: derive per-field reference entries from each frame ref.
 * ref_list[list][16+2*i] / [16+2*i+1] become the top/bottom fields of
 * ref_list[list][i] (doubled linesize, bottom field data offset by one
 * line). Weight/offset tables are replicated for the field entries. */
2965 static void fill_mbaff_ref_list(H264Context *h){
2966 int list, i, j;
2967 for(list=0; list<2; list++){ //FIXME try list_count
2968 for(i=0; i<h->ref_count[list]; i++){
2969 Picture *frame = &h->ref_list[list][i];
2970 Picture *field = &h->ref_list[list][16+2*i];
2971 field[0] = *frame;
2972 for(j=0; j<3; j++)
2973 field[0].linesize[j] <<= 1;
2974 field[0].reference = PICT_TOP_FIELD;
2975 field[0].poc= field[0].field_poc[0];
2976 field[1] = field[0];
2977 for(j=0; j<3; j++)
2978 field[1].data[j] += frame->linesize[j];
2979 field[1].reference = PICT_BOTTOM_FIELD;
2980 field[1].poc= field[1].field_poc[1];
2982 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2983 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2984 for(j=0; j<2; j++){
2985 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2986 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* implicit weights are indexed [ref1][ref0]; replicate rows and columns
 * for the field entries as well */
2990 for(j=0; j<h->ref_count[1]; j++){
2991 for(i=0; i<h->ref_count[0]; i++)
2992 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2993 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2994 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
/* Parse pred_weight_table(): explicit luma/chroma weights and offsets
 * per reference of each list. Entries without a flag get the default
 * weight (1<<denom) and zero offset. use_weight/use_weight_chroma are
 * set only if some entry differs from the defaults. For non-B slices
 * only list 0 is parsed.
 * @return 0 */
2998 static int pred_weight_table(H264Context *h){
2999 MpegEncContext * const s = &h->s;
3000 int list, i;
3001 int luma_def, chroma_def;
3003 h->use_weight= 0;
3004 h->use_weight_chroma= 0;
3005 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3006 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3007 luma_def = 1<<h->luma_log2_weight_denom;
3008 chroma_def = 1<<h->chroma_log2_weight_denom;
3010 for(list=0; list<2; list++){
3011 h->luma_weight_flag[list] = 0;
3012 h->chroma_weight_flag[list] = 0;
3013 for(i=0; i<h->ref_count[list]; i++){
3014 int luma_weight_flag, chroma_weight_flag;
3016 luma_weight_flag= get_bits1(&s->gb);
3017 if(luma_weight_flag){
3018 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3019 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3020 if( h->luma_weight[list][i] != luma_def
3021 || h->luma_offset[list][i] != 0) {
3022 h->use_weight= 1;
3023 h->luma_weight_flag[list]= 1;
3025 }else{
3026 h->luma_weight[list][i]= luma_def;
3027 h->luma_offset[list][i]= 0;
3030 if(CHROMA){
3031 chroma_weight_flag= get_bits1(&s->gb);
3032 if(chroma_weight_flag){
3033 int j;
3034 for(j=0; j<2; j++){
3035 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3036 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3037 if( h->chroma_weight[list][i][j] != chroma_def
3038 || h->chroma_offset[list][i][j] != 0) {
3039 h->use_weight_chroma= 1;
3040 h->chroma_weight_flag[list]= 1;
3043 }else{
3044 int j;
3045 for(j=0; j<2; j++){
3046 h->chroma_weight[list][i][j]= chroma_def;
3047 h->chroma_offset[list][i][j]= 0;
3052 if(h->slice_type_nos != FF_B_TYPE) break;
3054 h->use_weight= h->use_weight || h->use_weight_chroma;
3055 return 0;
/* Compute the implicit (POC-distance based) bi-prediction weight table.
 * Shortcut: a single ref per list whose POCs straddle the current POC
 * symmetrically needs no weighting at all. Otherwise use_weight is set
 * to 2 (implicit mode) and each [ref0][ref1] pair gets a weight derived
 * from the clipped temporal distances tb/td, falling back to 32 (equal
 * weighting) when out of range or td==0. */
3058 static void implicit_weight_table(H264Context *h){
3059 MpegEncContext * const s = &h->s;
3060 int ref0, ref1, i;
3061 int cur_poc = s->current_picture_ptr->poc;
3063 for (i = 0; i < 2; i++) {
3064 h->luma_weight_flag[i] = 0;
3065 h->chroma_weight_flag[i] = 0;
3068 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3069 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3070 h->use_weight= 0;
3071 h->use_weight_chroma= 0;
3072 return;
3075 h->use_weight= 2;
3076 h->use_weight_chroma= 2;
3077 h->luma_log2_weight_denom= 5;
3078 h->chroma_log2_weight_denom= 5;
3080 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3081 int poc0 = h->ref_list[0][ref0].poc;
3082 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3083 int poc1 = h->ref_list[1][ref1].poc;
3084 int td = av_clip(poc1 - poc0, -128, 127);
3085 if(td){
3086 int tb = av_clip(cur_poc - poc0, -128, 127);
3087 int tx = (16384 + (FFABS(td) >> 1)) / td;
3088 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3089 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3090 h->implicit_weight[ref0][ref1] = 32;
3091 else
3092 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3093 }else
3094 h->implicit_weight[ref0][ref1] = 32;
3100 * Mark a picture as no longer needed for reference. The refmask
3101 * argument allows unreferencing of individual fields or the whole frame.
3102 * If the picture becomes entirely unreferenced, but is being held for
3103 * display purposes, it is marked as such.
3104 * @param refmask mask of fields to unreference; the mask is bitwise
3105 * anded with the reference marking of pic
3106 * @return non-zero if pic becomes entirely unreferenced (except possibly
3107 * for display purposes) zero if one of the fields remains in
3108 * reference
/* See the description above: clear the refmask bits from pic->reference.
 * If the picture is still referenced by a field, return 0; otherwise
 * mark it DELAYED_PIC_REF if it sits in the delayed output queue and
 * return 1 (fully unreferenced). */
3110 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3111 int i;
3112 if (pic->reference &= refmask) {
3113 return 0;
3114 } else {
3115 for(i = 0; h->delayed_pic[i]; i++)
3116 if(pic == h->delayed_pic[i]){
3117 pic->reference=DELAYED_PIC_REF;
3118 break;
3120 return 1;
3125 * instantaneous decoder refresh.
/* Handle an instantaneous decoder refresh: drop all long and short term
 * references and reset the frame_num/POC prediction state. */
3127 static void idr(H264Context *h){
3128 int i;
3130 for(i=0; i<16; i++){
3131 remove_long(h, i, 0);
3133 assert(h->long_ref_count==0);
3135 for(i=0; i<h->short_ref_count; i++){
3136 unreference_pic(h, h->short_ref[i], 0);
3137 h->short_ref[i]= NULL;
3139 h->short_ref_count=0;
3140 h->prev_frame_num= 0;
3141 h->prev_frame_num_offset= 0;
3142 h->prev_poc_msb=
3143 h->prev_poc_lsb= 0;
3146 /* forget old pics after a seek */
/* forget old pics after a seek: drop the delayed output queue, reset
 * the reference state via idr(), clear field tracking and SEI state,
 * then flush the underlying MPEG context. */
3147 static void flush_dpb(AVCodecContext *avctx){
3148 H264Context *h= avctx->priv_data;
3149 int i;
3150 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3151 if(h->delayed_pic[i])
3152 h->delayed_pic[i]->reference= 0;
3153 h->delayed_pic[i]= NULL;
3155 h->outputed_poc= INT_MIN;
3156 idr(h);
3157 if(h->s.current_picture_ptr)
3158 h->s.current_picture_ptr->reference= 0;
3159 h->s.first_field= 0;
3160 reset_sei(h);
3161 ff_mpeg_flush(avctx);
3165 * Find a Picture in the short term reference list by frame number.
3166 * @param frame_num frame number to search for
3167 * @param idx the index into h->short_ref where returned picture is found
3168 * undefined if no picture found.
3169 * @return pointer to the found picture, or NULL if no pic with the provided
3170 * frame number is found
3172 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3173 MpegEncContext * const s = &h->s;
3174 int i;
3176 for(i=0; i<h->short_ref_count; i++){
3177 Picture *pic= h->short_ref[i];
3178 if(s->avctx->debug&FF_DEBUG_MMCO)
3179 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3180 if(pic->frame_num == frame_num) {
3181 *idx = i;
3182 return pic;
3185 return NULL;
3189 * Remove a picture from the short term reference list by its index in
3190 * that list. This does no checking on the provided index; it is assumed
3191 * to be valid. Other list entries are shifted down.
3192 * @param i index into h->short_ref of picture to remove.
/* See the description above: drop entry i from the short term list and
 * close the gap by shifting the following pointers down. No validation
 * of i is performed. */
3194 static void remove_short_at_index(H264Context *h, int i){
3195 assert(i >= 0 && i < h->short_ref_count);
3196 h->short_ref[i]= NULL;
3197 if (--h->short_ref_count)
3198 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3203 * @return the removed picture or NULL if an error occurs
/* Unreference (per ref_mask) the short term picture with the given
 * frame_num; if it becomes fully unreferenced, remove it from the short
 * term list as well.
 * @return the picture, or NULL if no such frame_num is in the list */
3205 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3206 MpegEncContext * const s = &h->s;
3207 Picture *pic;
3208 int i;
3210 if(s->avctx->debug&FF_DEBUG_MMCO)
3211 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3213 pic = find_short(h, frame_num, &i);
3214 if (pic){
3215 if(unreference_pic(h, pic, ref_mask))
3216 remove_short_at_index(h, i);
3219 return pic;
3223 * Remove a picture from the long term reference list by its index in
3224 * that list.
3225 * @return the removed picture or NULL if an error occurs
/* Unreference (per ref_mask) the long term picture at index i; if it
 * becomes fully unreferenced, clear its long_ref flag and remove it
 * from the long term list.
 * @return the picture, or NULL if slot i was empty */
3227 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3228 Picture *pic;
3230 pic= h->long_ref[i];
3231 if (pic){
3232 if(unreference_pic(h, pic, ref_mask)){
3233 assert(h->long_ref[i]->long_ref == 1);
3234 h->long_ref[i]->long_ref= 0;
3235 h->long_ref[i]= NULL;
3236 h->long_ref_count--;
3240 return pic;
3244 * print short term list
/* Debug helper: dump the short term reference list when FF_DEBUG_MMCO
 * is enabled; no effect otherwise. */
3246 static void print_short_term(H264Context *h) {
3247 uint32_t i;
3248 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3249 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3250 for(i=0; i<h->short_ref_count; i++){
3251 Picture *pic= h->short_ref[i];
3252 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3258 * print long term list
/* Debug helper: dump the occupied slots of the long term reference
 * array when FF_DEBUG_MMCO is enabled; no effect otherwise. */
3260 static void print_long_term(H264Context *h) {
3261 uint32_t i;
3262 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3263 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3264 for(i = 0; i < 16; i++){
3265 Picture *pic= h->long_ref[i];
3266 if (pic) {
3267 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3274 * Executes the reference picture marking (memory management control operations).
/* Execute the memory management control operations (MMCOs) parsed from
 * the bitstream, then perform the implicit marking of the current
 * picture and enforce the DPB reference-count limit.
 * @return 0 */
3276 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3277 MpegEncContext * const s = &h->s;
3278 int i, av_uninit(j);
3279 int current_ref_assigned=0;
3280 Picture *av_uninit(pic);
3282 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3283 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3285 for(i=0; i<mmco_count; i++){
3286 int av_uninit(structure), av_uninit(frame_num);
3287 if(s->avctx->debug&FF_DEBUG_MMCO)
3288 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
/* MMCOs operating on a short term ref first resolve its frame_num and
 * structure, and look the picture up in the short term list */
3290 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3291 || mmco[i].opcode == MMCO_SHORT2LONG){
3292 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3293 pic = find_short(h, frame_num, &j);
3294 if(!pic){
3295 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3296 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3297 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3298 continue;
3302 switch(mmco[i].opcode){
3303 case MMCO_SHORT2UNUSED:
3304 if(s->avctx->debug&FF_DEBUG_MMCO)
3305 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3306 remove_short(h, frame_num, structure ^ PICT_FRAME);
3307 break;
3308 case MMCO_SHORT2LONG:
/* move a short term ref to long term slot long_arg, evicting any
 * different picture already occupying that slot */
3309 if (h->long_ref[mmco[i].long_arg] != pic)
3310 remove_long(h, mmco[i].long_arg, 0);
3312 remove_short_at_index(h, j);
3313 h->long_ref[ mmco[i].long_arg ]= pic;
3314 if (h->long_ref[ mmco[i].long_arg ]){
3315 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3316 h->long_ref_count++;
3318 break;
3319 case MMCO_LONG2UNUSED:
3320 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3321 pic = h->long_ref[j];
3322 if (pic) {
3323 remove_long(h, j, structure ^ PICT_FRAME);
3324 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3325 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3326 break;
3327 case MMCO_LONG:
3328 // Comment below left from previous code as it is an interesting note.
3329 /* First field in pair is in short term list or
3330 * at a different long term index.
3331 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3332 * Report the problem and keep the pair where it is,
3333 * and mark this field valid.
3336 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3337 remove_long(h, mmco[i].long_arg, 0);
3339 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3340 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3341 h->long_ref_count++;
3344 s->current_picture_ptr->reference |= s->picture_structure;
3345 current_ref_assigned=1;
3346 break;
3347 case MMCO_SET_MAX_LONG:
3348 assert(mmco[i].long_arg <= 16);
3349 // just remove the long term which index is greater than new max
3350 for(j = mmco[i].long_arg; j<16; j++){
3351 remove_long(h, j, 0);
3353 break;
3354 case MMCO_RESET:
/* full DPB reset: drop all references and zero the POC/frame_num
 * state of the current picture */
3355 while(h->short_ref_count){
3356 remove_short(h, h->short_ref[0]->frame_num, 0);
3358 for(j = 0; j < 16; j++) {
3359 remove_long(h, j, 0);
3361 s->current_picture_ptr->poc=
3362 s->current_picture_ptr->field_poc[0]=
3363 s->current_picture_ptr->field_poc[1]=
3364 h->poc_lsb=
3365 h->poc_msb=
3366 h->frame_num=
3367 s->current_picture_ptr->frame_num= 0;
3368 break;
3369 default: assert(0);
3373 if (!current_ref_assigned) {
3374 /* Second field of complementary field pair; the first field of
3375 * which is already referenced. If short referenced, it
3376 * should be first entry in short_ref. If not, it must exist
3377 * in long_ref; trying to put it on the short list here is an
3378 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3380 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3381 /* Just mark the second field valid */
3382 s->current_picture_ptr->reference = PICT_FRAME;
3383 } else if (s->current_picture_ptr->long_ref) {
3384 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3385 "assignment for second field "
3386 "in complementary field pair "
3387 "(first field is long term)\n");
3388 } else {
3389 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3390 if(pic){
3391 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
/* prepend the current picture to the short term list */
3394 if(h->short_ref_count)
3395 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3397 h->short_ref[0]= s->current_picture_ptr;
3398 h->short_ref_count++;
3399 s->current_picture_ptr->reference |= s->picture_structure;
3403 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3405 /* We have too many reference frames, probably due to corrupted
3406 * stream. Need to discard one frame. Prevents overrun of the
3407 * short_ref and long_ref buffers.
3409 av_log(h->s.avctx, AV_LOG_ERROR,
3410 "number of reference frames exceeds max (probably "
3411 "corrupt input), discarding one\n");
3413 if (h->long_ref_count && !h->short_ref_count) {
3414 for (i = 0; i < 16; ++i)
3415 if (h->long_ref[i])
3416 break;
3418 assert(i < 16);
3419 remove_long(h, i, 0);
3420 } else {
3421 pic = h->short_ref[h->short_ref_count - 1];
3422 remove_short(h, pic->frame_num, 0);
3426 print_short_term(h);
3427 print_long_term(h);
3428 return 0;
/* Parse dec_ref_pic_marking() into h->mmco[]/h->mmco_index.
 * IDR slices translate long_term_reference_flag into a single MMCO_LONG.
 * Otherwise, with adaptive marking, up to MAX_MMCO_COUNT opcodes with
 * their arguments are read and range-checked; with sliding-window
 * marking, a synthetic MMCO_SHORT2UNUSED is generated when the DPB is
 * already full (two of them for field pictures).
 * @return 0 on success, -1 on invalid opcodes/arguments */
3431 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3432 MpegEncContext * const s = &h->s;
3433 int i;
3435 h->mmco_index= 0;
3436 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3437 s->broken_link= get_bits1(gb) -1;
3438 if(get_bits1(gb)){
3439 h->mmco[0].opcode= MMCO_LONG;
3440 h->mmco[0].long_arg= 0;
3441 h->mmco_index= 1;
3443 }else{
3444 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3445 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3446 MMCOOpcode opcode= get_ue_golomb_31(gb);
3448 h->mmco[i].opcode= opcode;
3449 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3450 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3451 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3452 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3453 return -1;
3456 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3457 unsigned int long_arg= get_ue_golomb_31(gb);
3458 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3459 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3460 return -1;
3462 h->mmco[i].long_arg= long_arg;
3465 if(opcode > (unsigned)MMCO_LONG){
3466 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3467 return -1;
3469 if(opcode == MMCO_END)
3470 break;
3472 h->mmco_index= i;
3473 }else{
/* sliding window: implicitly drop the oldest short term ref once the
 * DPB holds ref_frame_count pictures */
3474 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3476 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3477 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3478 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3479 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3480 h->mmco_index= 1;
3481 if (FIELD_PICTURE) {
3482 h->mmco[0].short_pic_num *= 2;
3483 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3484 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3485 h->mmco_index= 2;
3491 return 0;
/* Compute the picture order count (POC) of the current picture for the
 * three POC types: type 0 (explicit poc_lsb with msb wrap prediction),
 * type 1 (expected POC from the SPS ref-frame offset cycle plus deltas)
 * and type 2 (POC derived directly from frame_num). Field POCs are
 * stored per structure; cur->poc becomes the minimum of the two.
 * @return 0 */
3494 static int init_poc(H264Context *h){
3495 MpegEncContext * const s = &h->s;
3496 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3497 int field_poc[2];
3498 Picture *cur = s->current_picture_ptr;
3500 h->frame_num_offset= h->prev_frame_num_offset;
3501 if(h->frame_num < h->prev_frame_num)
3502 h->frame_num_offset += max_frame_num;
3504 if(h->sps.poc_type==0){
3505 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
/* detect poc_lsb wraparound in either direction to pick poc_msb */
3507 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3508 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3509 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3510 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3511 else
3512 h->poc_msb = h->prev_poc_msb;
3513 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3514 field_poc[0] =
3515 field_poc[1] = h->poc_msb + h->poc_lsb;
3516 if(s->picture_structure == PICT_FRAME)
3517 field_poc[1] += h->delta_poc_bottom;
3518 }else if(h->sps.poc_type==1){
3519 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3520 int i;
3522 if(h->sps.poc_cycle_length != 0)
3523 abs_frame_num = h->frame_num_offset + h->frame_num;
3524 else
3525 abs_frame_num = 0;
3527 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3528 abs_frame_num--;
3530 expected_delta_per_poc_cycle = 0;
3531 for(i=0; i < h->sps.poc_cycle_length; i++)
3532 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3534 if(abs_frame_num > 0){
3535 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3536 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3538 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3539 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3540 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3541 } else
3542 expectedpoc = 0;
3544 if(h->nal_ref_idc == 0)
3545 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3547 field_poc[0] = expectedpoc + h->delta_poc[0];
3548 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3550 if(s->picture_structure == PICT_FRAME)
3551 field_poc[1] += h->delta_poc[1];
3552 }else{
3553 int poc= 2*(h->frame_num_offset + h->frame_num);
3555 if(!h->nal_ref_idc)
3556 poc--;
3558 field_poc[0]= poc;
3559 field_poc[1]= poc;
/* only update the POC of the field(s) the current picture covers */
3562 if(s->picture_structure != PICT_BOTTOM_FIELD)
3563 s->current_picture_ptr->field_poc[0]= field_poc[0];
3564 if(s->picture_structure != PICT_TOP_FIELD)
3565 s->current_picture_ptr->field_poc[1]= field_poc[1];
3566 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3568 return 0;
3573 * initialize scan tables
/* Build the coefficient scan tables. With the C IDCT the standard scan
 * orders are copied as-is; otherwise the T() permutations remap them to
 * the transposed coefficient layout the optimized IDCTs expect. The
 * *_q0 pointers select the unpermuted tables when transform_bypass is
 * active (qp==0 is lossless and skips the IDCT). */
3575 static void init_scan_tables(H264Context *h){
3576 MpegEncContext * const s = &h->s;
3577 int i;
3578 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3579 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3580 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3581 }else{
3582 for(i=0; i<16; i++){
3583 #define T(x) (x>>2) | ((x<<2) & 0xF)
3584 h->zigzag_scan[i] = T(zigzag_scan[i]);
3585 h-> field_scan[i] = T( field_scan[i]);
3586 #undef T
3589 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3590 memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
3591 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3592 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3593 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3594 }else{
3595 for(i=0; i<64; i++){
3596 #define T(x) (x>>3) | ((x&7)<<3)
3597 h->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]);
3598 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3599 h->field_scan8x8[i] = T(field_scan8x8[i]);
3600 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3601 #undef T
3604 if(h->sps.transform_bypass){ //FIXME same ugly
3605 h->zigzag_scan_q0 = zigzag_scan;
3606 h->zigzag_scan8x8_q0 = ff_zigzag_direct;
3607 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3608 h->field_scan_q0 = field_scan;
3609 h->field_scan8x8_q0 = field_scan8x8;
3610 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3611 }else{
3612 h->zigzag_scan_q0 = h->zigzag_scan;
3613 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3614 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3615 h->field_scan_q0 = h->field_scan;
3616 h->field_scan8x8_q0 = h->field_scan8x8;
3617 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
/* Finish decoding the current field/frame: run reference marking (for
 * non-droppable pictures), update the POC/frame_num prediction state,
 * notify hwaccel/VDPAU backends, run error resilience (frames only) and
 * close the MPV frame. */
3621 static void field_end(H264Context *h){
3622 MpegEncContext * const s = &h->s;
3623 AVCodecContext * const avctx= s->avctx;
3624 s->mb_y= 0;
3626 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3627 s->current_picture_ptr->pict_type= s->pict_type;
3629 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3630 ff_vdpau_h264_set_reference_frames(s);
3632 if(!s->dropable) {
3633 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3634 h->prev_poc_msb= h->poc_msb;
3635 h->prev_poc_lsb= h->poc_lsb;
3637 h->prev_frame_num_offset= h->frame_num_offset;
3638 h->prev_frame_num= h->frame_num;
3640 if (avctx->hwaccel) {
3641 if (avctx->hwaccel->end_frame(avctx) < 0)
3642 av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3645 if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3646 ff_vdpau_h264_picture_complete(s);
3649 * FIXME: Error handling code does not seem to support interlaced
3650 * when slices span multiple rows
3651 * The ff_er_add_slice calls don't work right for bottom
3652 * fields; they cause massive erroneous error concealing
3653 * Error marking covers both fields (top and bottom).
3654 * This causes a mismatched s->error_count
3655 * and a bad error table. Further, the error count goes to
3656 * INT_MAX when called for bottom field, because mb_y is
3657 * past end by one (callers fault) and resync_mb_y != 0
3658 * causes problems for the first MB line, too.
3660 if (!FIELD_PICTURE)
3661 ff_er_frame_end(s);
3663 MPV_frame_end(s);
3665 h->current_slice=0;
3669 * Replicates H264 "master" context to thread contexts.
/* See the description above: copy the per-frame decoding state the
 * slice threads need (current picture, line sizes, POC prediction,
 * reference lists and dequant tables) from the master context. */
3671 static void clone_slice(H264Context *dst, H264Context *src)
3673 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3674 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3675 dst->s.current_picture = src->s.current_picture;
3676 dst->s.linesize = src->s.linesize;
3677 dst->s.uvlinesize = src->s.uvlinesize;
3678 dst->s.first_field = src->s.first_field;
3680 dst->prev_poc_msb = src->prev_poc_msb;
3681 dst->prev_poc_lsb = src->prev_poc_lsb;
3682 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3683 dst->prev_frame_num = src->prev_frame_num;
3684 dst->short_ref_count = src->short_ref_count;
3686 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3687 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3688 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3689 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3691 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3692 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3696 * decodes a slice header.
3697 * This will also call MPV_common_init() and frame_start() as needed.
3699 * @param h h264context
3700 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3702 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
static int decode_slice_header(H264Context *h, H264Context *h0){
    MpegEncContext * const s = &h->s;
    MpegEncContext * const s0 = &h0->s;
    unsigned int first_mb_in_slice;
    unsigned int pps_id;
    int num_ref_idx_active_override_flag;
    unsigned int slice_type, tmp, i, j;
    int default_ref_list_done = 0;
    int last_pic_structure;

    /* a slice with nal_ref_idc == 0 is never used as a reference */
    s->dropable= h->nal_ref_idc == 0;

    /* FAST mode may use the cheaper 2-tap qpel filters on non-reference slices */
    if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
        s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
        s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
    }else{
        s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
        s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
    }

    first_mb_in_slice= get_ue_golomb(&s->gb);

    if(first_mb_in_slice == 0){ //FIXME better field boundary detection
        if(h0->current_slice && FIELD_PICTURE){
            field_end(h);
        }

        h0->current_slice = 0;
        if (!s0->first_field)
            s->current_picture_ptr= NULL;
    }

    /* slice_type: values 5..9 mean "same type for the whole picture" */
    slice_type= get_ue_golomb_31(&s->gb);
    if(slice_type > 9){
        av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
        return -1;
    }
    if(slice_type > 4){
        slice_type -= 5;
        h->slice_type_fixed=1;
    }else
        h->slice_type_fixed=0;

    slice_type= golomb_to_pict_type[ slice_type ];
    /* I slices, and repeats of the previous slice type, can reuse the
       already-built default reference list */
    if (slice_type == FF_I_TYPE
        || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
        default_ref_list_done = 1;
    }
    h->slice_type= slice_type;
    h->slice_type_nos= slice_type & 3; /* strips the SI/SP distinction */

    s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
    if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
        av_log(h->s.avctx, AV_LOG_ERROR,
               "B picture before any references, skipping\n");
        return -1;
    }

    /* activate the referenced PPS and its SPS (copied by value) */
    pps_id= get_ue_golomb(&s->gb);
    if(pps_id>=MAX_PPS_COUNT){
        av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
        return -1;
    }
    if(!h0->pps_buffers[pps_id]) {
        av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
        return -1;
    }
    h->pps= *h0->pps_buffers[pps_id];

    if(!h0->sps_buffers[h->pps.sps_id]) {
        av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
        return -1;
    }
    h->sps = *h0->sps_buffers[h->pps.sps_id];

    /* dequant tables depend on the PPS; rebuild only on the master context */
    if(h == h0 && h->dequant_coeff_pps != pps_id){
        h->dequant_coeff_pps = pps_id;
        init_dequant_tables(h);
    }

    s->mb_width= h->sps.mb_width;
    /* field-coded streams store half-height MBs; double for the frame */
    s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);

    h->b_stride=  s->mb_width*4;
    h->b8_stride= s->mb_width*2;

    /* apply SPS cropping; offsets are in chroma samples, hence the *2/*4 */
    s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
    if(h->sps.frame_mbs_only_flag)
        s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
    else
        s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);

    /* dimension change forces a full context teardown + re-init below */
    if (s->context_initialized
        && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
        if(h != h0)
            return -1;   // width / height changed during parallelized decoding
        free_tables(h);
        flush_dpb(s->avctx);
        MPV_common_end(s);
    }
    if (!s->context_initialized) {
        if(h != h0)
            return -1;  // we cant (re-)initialize context during parallel decoding
        if (MPV_common_init(s) < 0)
            return -1;
        s->first_field = 0;

        init_scan_tables(h);
        alloc_tables(h);

        /* create per-thread H264Contexts on top of the MpegEncContexts that
           MPV_common_init() allocated */
        for(i = 1; i < s->avctx->thread_count; i++) {
            H264Context *c;
            c = h->thread_context[i] = av_malloc(sizeof(H264Context));
            memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
            memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
            c->sps = h->sps;
            c->pps = h->pps;
            init_scan_tables(c);
            clone_tables(c, h);
        }

        for(i = 0; i < s->avctx->thread_count; i++)
            if(context_init(h->thread_context[i]) < 0)
                return -1;

        s->avctx->width = s->width;
        s->avctx->height = s->height;
        s->avctx->sample_aspect_ratio= h->sps.sar;
        if(!s->avctx->sample_aspect_ratio.den)
            s->avctx->sample_aspect_ratio.den = 1;

        if(h->sps.timing_info_present_flag){
            s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
            /* work around old x264 writing half the correct time_scale */
            if(h->x264_build > 0 && h->x264_build < 44)
                s->avctx->time_base.den *= 2;
            av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
                      s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
        }
    }

    h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);

    h->mb_mbaff = 0;
    h->mb_aff_frame = 0;
    last_pic_structure = s0->picture_structure;
    if(h->sps.frame_mbs_only_flag){
        s->picture_structure= PICT_FRAME;
    }else{
        if(get_bits1(&s->gb)) { //field_pic_flag
            s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
        } else {
            s->picture_structure= PICT_FRAME;
            h->mb_aff_frame = h->sps.mb_aff;
        }
    }
    h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;

    if(h0->current_slice == 0){
        /* fill in dummy frames for any gap in frame_num (lost pictures) */
        while(h->frame_num !=  h->prev_frame_num &&
              h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
            av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
            if (frame_start(h) < 0)
                return -1;
            h->prev_frame_num++;
            h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
            s->current_picture_ptr->frame_num= h->prev_frame_num;
            execute_ref_pic_marking(h, NULL, 0);
        }

        /* See if we have a decoded first field looking for a pair... */
        if (s0->first_field) {
            assert(s0->current_picture_ptr);
            assert(s0->current_picture_ptr->data[0]);
            assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);

            /* figure out if we have a complementary field pair */
            if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
                /*
                 * Previous field is unmatched. Don't display it, but let it
                 * remain for reference if marked as such.
                 */
                s0->current_picture_ptr = NULL;
                s0->first_field = FIELD_PICTURE;

            } else {
                if (h->nal_ref_idc &&
                        s0->current_picture_ptr->reference &&
                        s0->current_picture_ptr->frame_num != h->frame_num) {
                    /*
                     * This and previous field were reference, but had
                     * different frame_nums. Consider this field first in
                     * pair. Throw away previous field except for reference
                     * purposes.
                     */
                    s0->first_field = 1;
                    s0->current_picture_ptr = NULL;

                } else {
                    /* Second field in complementary pair */
                    s0->first_field = 0;
                }
            }

        } else {
            /* Frame or first field in a potentially complementary pair */
            assert(!s0->current_picture_ptr);
            s0->first_field = FIELD_PICTURE;
        }

        if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
            s0->first_field = 0;
            return -1;
        }
    }
    if(h != h0)
        clone_slice(h, h0);

    s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup

    assert(s->mb_num == s->mb_width * s->mb_height);
    if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
       first_mb_in_slice                    >= s->mb_num){
        av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
        return -1;
    }
    s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
    s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
    if (s->picture_structure == PICT_BOTTOM_FIELD)
        s->resync_mb_y = s->mb_y = s->mb_y + 1;
    assert(s->mb_y < s->mb_height);

    /* field pictures number their fields individually (2*frame_num+1) */
    if(s->picture_structure==PICT_FRAME){
        h->curr_pic_num=   h->frame_num;
        h->max_pic_num= 1<< h->sps.log2_max_frame_num;
    }else{
        h->curr_pic_num= 2*h->frame_num + 1;
        h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
    }

    if(h->nal_unit_type == NAL_IDR_SLICE){
        get_ue_golomb(&s->gb); /* idr_pic_id */
    }

    /* picture order count information, layout depends on sps.poc_type */
    if(h->sps.poc_type==0){
        h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);

        if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
            h->delta_poc_bottom= get_se_golomb(&s->gb);
        }
    }

    if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
        h->delta_poc[0]= get_se_golomb(&s->gb);

        if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
            h->delta_poc[1]= get_se_golomb(&s->gb);
    }

    init_poc(h);

    if(h->pps.redundant_pic_cnt_present){
        h->redundant_pic_count= get_ue_golomb(&s->gb);
    }

    //set defaults, might be overridden a few lines later
    h->ref_count[0]= h->pps.ref_count[0];
    h->ref_count[1]= h->pps.ref_count[1];

    if(h->slice_type_nos != FF_I_TYPE){
        if(h->slice_type_nos == FF_B_TYPE){
            h->direct_spatial_mv_pred= get_bits1(&s->gb);
        }
        num_ref_idx_active_override_flag= get_bits1(&s->gb);

        if(num_ref_idx_active_override_flag){
            h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
            if(h->slice_type_nos==FF_B_TYPE)
                h->ref_count[1]= get_ue_golomb(&s->gb) + 1;

            if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
                av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
                h->ref_count[0]= h->ref_count[1]= 1;
                return -1;
            }
        }
        if(h->slice_type_nos == FF_B_TYPE)
            h->list_count= 2;
        else
            h->list_count= 1;
    }else
        h->list_count= 0;

    if(!default_ref_list_done){
        fill_default_ref_list(h);
    }

    if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
        return -1;

    /* expose the nearest references through the MpegEncContext aliases */
    if(h->slice_type_nos!=FF_I_TYPE){
        s->last_picture_ptr= &h->ref_list[0][0];
        ff_copy_picture(&s->last_picture, s->last_picture_ptr);
    }
    if(h->slice_type_nos==FF_B_TYPE){
        s->next_picture_ptr= &h->ref_list[1][0];
        ff_copy_picture(&s->next_picture, s->next_picture_ptr);
    }

    if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
       ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
        pred_weight_table(h);
    else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
        implicit_weight_table(h);
    else {
        h->use_weight = 0;
        for (i = 0; i < 2; i++) {
            h->luma_weight_flag[i]   = 0;
            h->chroma_weight_flag[i] = 0;
        }
    }

    if(h->nal_ref_idc)
        decode_ref_pic_marking(h0, &s->gb);

    if(FRAME_MBAFF)
        fill_mbaff_ref_list(h);

    if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
        direct_dist_scale_factor(h);
    direct_ref_list_init(h);

    if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
        tmp = get_ue_golomb_31(&s->gb);
        if(tmp > 2){
            av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
            return -1;
        }
        h->cabac_init_idc= tmp;
    }

    h->last_qscale_diff = 0;
    tmp = h->pps.init_qp + get_se_golomb(&s->gb);
    if(tmp>51){
        av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
        return -1;
    }
    s->qscale= tmp;
    h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
    h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
    //FIXME qscale / qp ... stuff
    if(h->slice_type == FF_SP_TYPE){
        get_bits1(&s->gb); /* sp_for_switch_flag */
    }
    if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
        get_se_golomb(&s->gb); /* slice_qs_delta */
    }

    /* internal deblocking_filter convention: 0=off, 1=on, 2=on but not
       across slice boundaries — note the XOR below swaps the bitstream's
       0 (on) and 1 (off) encodings */
    h->deblocking_filter = 1;
    h->slice_alpha_c0_offset = 0;
    h->slice_beta_offset = 0;
    if( h->pps.deblocking_filter_parameters_present ) {
        tmp= get_ue_golomb_31(&s->gb);
        if(tmp > 2){
            av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
            return -1;
        }
        h->deblocking_filter= tmp;
        if(h->deblocking_filter < 2)
            h->deblocking_filter^= 1; // 1<->0

        if( h->deblocking_filter ) {
            h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
            h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
        }
    }

    if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
       ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
       ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
       ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
        h->deblocking_filter= 0;

    if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
        if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
            /* Cheat slightly for speed:
               Do not bother to deblock across slices. */
            h->deblocking_filter = 2;
        } else {
            h0->max_contexts = 1;
            if(!h0->single_decode_warning) {
                av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
                h0->single_decode_warning = 1;
            }
            if(h != h0)
                return 1; // deblocking switched inside frame
        }
    }

#if 0 //FMO
    if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
        slice_group_change_cycle= get_bits(&s->gb, ?);
#endif

    h0->last_slice_type = slice_type;
    h->slice_num = ++h0->current_slice;
    /* NOTE(review): only logs on overflow, does not return an error —
       slice_num is masked with MAX_SLICES-1 below; confirm intended */
    if(h->slice_num >= MAX_SLICES){
        av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
    }

    /* build the ref_index -> frame_num mapping used by the loop filter;
       entries 0..1 and 18..19 are -1 sentinels for unavailable refs */
    for(j=0; j<2; j++){
        int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
        ref2frm[0]=
        ref2frm[1]= -1;
        for(i=0; i<16; i++)
            ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
                          +(h->ref_list[j][i].reference&3);
        ref2frm[18+0]=
        ref2frm[18+1]= -1;
        for(i=16; i<48; i++)
            ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
                          +(h->ref_list[j][i].reference&3);
    }

    h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
    h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;

    s->avctx->refs= h->sps.ref_frame_count;

    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
               h->slice_num,
               (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
               first_mb_in_slice,
               av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
               pps_id, h->frame_num,
               s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
               h->ref_count[0], h->ref_count[1],
               s->qscale,
               h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
               h->use_weight,
               h->use_weight==1 && h->use_weight_chroma ? "c" : "",
               h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
               );
    }

    return 0;
}
4155 static inline int get_level_prefix(GetBitContext *gb){
4156 unsigned int buf;
4157 int log;
4159 OPEN_READER(re, gb);
4160 UPDATE_CACHE(re, gb);
4161 buf=GET_CACHE(re, gb);
4163 log= 32 - av_log2(buf);
4164 #ifdef TRACE
4165 print_bin(buf>>(32-log), log);
4166 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4167 #endif
4169 LAST_SKIP_BITS(re, gb, log);
4170 CLOSE_READER(re, gb);
4172 return log-1;
4175 static inline int get_dct8x8_allowed(H264Context *h){
4176 if(h->sps.direct_8x8_inference_flag)
4177 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL));
4178 else
4179 return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4183 * decodes a residual block.
4184 * @param n block index
4185 * @param scantable scantable
4186 * @param max_coeff number of coefficients in the block
4187 * @return <0 if an error occurred
static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
    MpegEncContext * const s = &h->s;
    /* maps predicted nz count (0..16) to one of the 4 coeff_token tables */
    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    int level[16];
    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;

    //FIXME put trailing_onex into the context

    /* coeff_token packs (total_coeff<<2)|trailing_ones; the VLC table used
       for luma depends on the neighbours' non-zero counts */
    if(n == CHROMA_DC_BLOCK_INDEX){
        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
        total_coeff= coeff_token>>2;
    }else{
        if(n == LUMA_DC_BLOCK_INDEX){
            total_coeff= pred_non_zero_count(h, 0);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
        }else{
            total_coeff= pred_non_zero_count(h, n);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
        }
    }

    //FIXME set last_non_zero?

    if(total_coeff==0)
        return 0;
    if(total_coeff > (unsigned)max_coeff) {
        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
        return -1;
    }

    trailing_ones= coeff_token&3;
    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
    assert(total_coeff<=16);

    /* speculatively decode up to 3 trailing-one sign bits from a 3-bit
       peek, but only actually consume trailing_ones of them */
    i = show_bits(gb, 3);
    skip_bits(gb, trailing_ones);
    level[0] = 1-((i&4)>>1);
    level[1] = 1-((i&2)   );
    level[2] = 1-((i&1)<<1);

    if(trailing_ones<total_coeff) {
        int mask, prefix;
        int suffix_length = total_coeff > 10 && trailing_ones < 3;
        /* cavlc_level_tab caches (value, bits-to-skip) for short codes;
           level_code >= 100 flags an escape that needs full decoding */
        int bitsi= show_bits(gb, LEVEL_TAB_BITS);
        int level_code= cavlc_level_tab[suffix_length][bitsi][0];

        skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
        if(level_code >= 100){
            prefix= level_code - 100;
            if(prefix == LEVEL_TAB_BITS)
                prefix += get_level_prefix(gb);

            //first coefficient has suffix_length equal to 0 or 1
            if(prefix<14){ //FIXME try to build a large unified VLC table for all this
                if(suffix_length)
                    level_code= (prefix<<1) + get_bits1(gb); //part
                else
                    level_code= prefix; //part
            }else if(prefix==14){
                if(suffix_length)
                    level_code= (prefix<<1) + get_bits1(gb); //part
                else
                    level_code= prefix + get_bits(gb, 4); //part
            }else{
                level_code= 30 + get_bits(gb, prefix-3); //part
                if(prefix>=16)
                    level_code += (1<<(prefix-3))-4096;
            }

            /* with fewer than 3 trailing ones, |level|==1 is impossible
               here, so codes are shifted by 2 */
            if(trailing_ones < 3) level_code += 2;

            suffix_length = 2;
            mask= -(level_code&1);
            /* unsign: even codes -> positive, odd codes -> negative */
            level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
        }else{
            if(trailing_ones < 3) level_code += (level_code>>31)|1;

            suffix_length = 1;
            if(level_code + 3U > 6U)
                suffix_length++;
            level[trailing_ones]= level_code;
        }

        //remaining coefficients have suffix_length > 0
        for(i=trailing_ones+1;i<total_coeff;i++) {
            /* thresholds above which suffix_length grows by one */
            static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
            int bitsi= show_bits(gb, LEVEL_TAB_BITS);
            level_code= cavlc_level_tab[suffix_length][bitsi][0];

            skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
            if(level_code >= 100){
                prefix= level_code - 100;
                if(prefix == LEVEL_TAB_BITS){
                    prefix += get_level_prefix(gb);
                }
                if(prefix<15){
                    level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
                }else{
                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
                    if(prefix>=16)
                        level_code += (1<<(prefix-3))-4096;
                }
                mask= -(level_code&1);
                level_code= (((2+level_code)>>1) ^ mask) - mask;
            }
            level[i]= level_code;

            if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
                suffix_length++;
        }
    }

    /* a full block cannot contain embedded zero runs */
    if(total_coeff == max_coeff)
        zeros_left=0;
    else{
        if(n == CHROMA_DC_BLOCK_INDEX)
            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
        else
            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

    /* place levels back-to-front along the scan; n > 24 blocks
       (presumably the DC paths — confirm against callers) skip the
       dequant multiply, the rest apply qmul with rounding */
    coeff_num = zeros_left + total_coeff - 1;
    j = scantable[coeff_num];
    if(n > 24){
        block[j] = level[0];
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= level[i];
        }
    }else{
        block[j] = (level[0] * qmul[j] + 32)>>6;
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= (level[i] * qmul[j] + 32)>>6;
        }
    }

    if(zeros_left<0){
        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    return 0;
}
4357 static void predict_field_decoding_flag(H264Context *h){
4358 MpegEncContext * const s = &h->s;
4359 const int mb_xy= h->mb_xy;
4360 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4361 ? s->current_picture.mb_type[mb_xy-1]
4362 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4363 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4364 : 0;
4365 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4369 * decodes a P_SKIP or B_SKIP macroblock
4371 static void decode_mb_skip(H264Context *h){
4372 MpegEncContext * const s = &h->s;
4373 const int mb_xy= h->mb_xy;
4374 int mb_type=0;
4376 memset(h->non_zero_count[mb_xy], 0, 16);
4377 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4379 if(MB_FIELD)
4380 mb_type|= MB_TYPE_INTERLACED;
4382 if( h->slice_type_nos == FF_B_TYPE )
4384 // just for fill_caches. pred_direct_motion will set the real mb_type
4385 mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4387 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4388 pred_direct_motion(h, &mb_type);
4389 mb_type|= MB_TYPE_SKIP;
4391 else
4393 int mx, my;
4394 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4396 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4397 pred_pskip_motion(h, &mx, &my);
4398 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4399 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4402 write_back_motion(h, mb_type);
4403 s->current_picture.mb_type[mb_xy]= mb_type;
4404 s->current_picture.qscale_table[mb_xy]= s->qscale;
4405 h->slice_table[ mb_xy ]= h->slice_num;
4406 h->prev_mb_skipped= 1;
4410 * decodes a macroblock
4411 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4413 static int decode_mb_cavlc(H264Context *h){
4414 MpegEncContext * const s = &h->s;
4415 int mb_xy;
4416 int partition_count;
4417 unsigned int mb_type, cbp;
4418 int dct8x8_allowed= h->pps.transform_8x8_mode;
4420 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4422 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4423 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4424 down the code */
4425 if(h->slice_type_nos != FF_I_TYPE){
4426 if(s->mb_skip_run==-1)
4427 s->mb_skip_run= get_ue_golomb(&s->gb);
4429 if (s->mb_skip_run--) {
4430 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4431 if(s->mb_skip_run==0)
4432 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4433 else
4434 predict_field_decoding_flag(h);
4436 decode_mb_skip(h);
4437 return 0;
4440 if(FRAME_MBAFF){
4441 if( (s->mb_y&1) == 0 )
4442 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4445 h->prev_mb_skipped= 0;
4447 mb_type= get_ue_golomb(&s->gb);
4448 if(h->slice_type_nos == FF_B_TYPE){
4449 if(mb_type < 23){
4450 partition_count= b_mb_type_info[mb_type].partition_count;
4451 mb_type= b_mb_type_info[mb_type].type;
4452 }else{
4453 mb_type -= 23;
4454 goto decode_intra_mb;
4456 }else if(h->slice_type_nos == FF_P_TYPE){
4457 if(mb_type < 5){
4458 partition_count= p_mb_type_info[mb_type].partition_count;
4459 mb_type= p_mb_type_info[mb_type].type;
4460 }else{
4461 mb_type -= 5;
4462 goto decode_intra_mb;
4464 }else{
4465 assert(h->slice_type_nos == FF_I_TYPE);
4466 if(h->slice_type == FF_SI_TYPE && mb_type)
4467 mb_type--;
4468 decode_intra_mb:
4469 if(mb_type > 25){
4470 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4471 return -1;
4473 partition_count=0;
4474 cbp= i_mb_type_info[mb_type].cbp;
4475 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4476 mb_type= i_mb_type_info[mb_type].type;
4479 if(MB_FIELD)
4480 mb_type |= MB_TYPE_INTERLACED;
4482 h->slice_table[ mb_xy ]= h->slice_num;
4484 if(IS_INTRA_PCM(mb_type)){
4485 unsigned int x;
4487 // We assume these blocks are very rare so we do not optimize it.
4488 align_get_bits(&s->gb);
4490 // The pixels are stored in the same order as levels in h->mb array.
4491 for(x=0; x < (CHROMA ? 384 : 256); x++){
4492 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4495 // In deblocking, the quantizer is 0
4496 s->current_picture.qscale_table[mb_xy]= 0;
4497 // All coeffs are present
4498 memset(h->non_zero_count[mb_xy], 16, 16);
4500 s->current_picture.mb_type[mb_xy]= mb_type;
4501 return 0;
4504 if(MB_MBAFF){
4505 h->ref_count[0] <<= 1;
4506 h->ref_count[1] <<= 1;
4509 fill_caches(h, mb_type, 0);
4511 //mb_pred
4512 if(IS_INTRA(mb_type)){
4513 int pred_mode;
4514 // init_top_left_availability(h);
4515 if(IS_INTRA4x4(mb_type)){
4516 int i;
4517 int di = 1;
4518 if(dct8x8_allowed && get_bits1(&s->gb)){
4519 mb_type |= MB_TYPE_8x8DCT;
4520 di = 4;
4523 // fill_intra4x4_pred_table(h);
4524 for(i=0; i<16; i+=di){
4525 int mode= pred_intra_mode(h, i);
4527 if(!get_bits1(&s->gb)){
4528 const int rem_mode= get_bits(&s->gb, 3);
4529 mode = rem_mode + (rem_mode >= mode);
4532 if(di==4)
4533 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4534 else
4535 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4537 write_back_intra_pred_mode(h);
4538 if( check_intra4x4_pred_mode(h) < 0)
4539 return -1;
4540 }else{
4541 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4542 if(h->intra16x16_pred_mode < 0)
4543 return -1;
4545 if(CHROMA){
4546 pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4547 if(pred_mode < 0)
4548 return -1;
4549 h->chroma_pred_mode= pred_mode;
4551 }else if(partition_count==4){
4552 int i, j, sub_partition_count[4], list, ref[2][4];
4554 if(h->slice_type_nos == FF_B_TYPE){
4555 for(i=0; i<4; i++){
4556 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4557 if(h->sub_mb_type[i] >=13){
4558 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4559 return -1;
4561 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4562 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4564 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4565 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4566 pred_direct_motion(h, &mb_type);
4567 h->ref_cache[0][scan8[4]] =
4568 h->ref_cache[1][scan8[4]] =
4569 h->ref_cache[0][scan8[12]] =
4570 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4572 }else{
4573 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4574 for(i=0; i<4; i++){
4575 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4576 if(h->sub_mb_type[i] >=4){
4577 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4578 return -1;
4580 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4581 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4585 for(list=0; list<h->list_count; list++){
4586 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4587 for(i=0; i<4; i++){
4588 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4589 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4590 unsigned int tmp;
4591 if(ref_count == 1){
4592 tmp= 0;
4593 }else if(ref_count == 2){
4594 tmp= get_bits1(&s->gb)^1;
4595 }else{
4596 tmp= get_ue_golomb_31(&s->gb);
4597 if(tmp>=ref_count){
4598 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4599 return -1;
4602 ref[list][i]= tmp;
4603 }else{
4604 //FIXME
4605 ref[list][i] = -1;
4610 if(dct8x8_allowed)
4611 dct8x8_allowed = get_dct8x8_allowed(h);
4613 for(list=0; list<h->list_count; list++){
4614 for(i=0; i<4; i++){
4615 if(IS_DIRECT(h->sub_mb_type[i])) {
4616 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4617 continue;
4619 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4620 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4622 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4623 const int sub_mb_type= h->sub_mb_type[i];
4624 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4625 for(j=0; j<sub_partition_count[i]; j++){
4626 int mx, my;
4627 const int index= 4*i + block_width*j;
4628 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4629 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4630 mx += get_se_golomb(&s->gb);
4631 my += get_se_golomb(&s->gb);
4632 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4634 if(IS_SUB_8X8(sub_mb_type)){
4635 mv_cache[ 1 ][0]=
4636 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4637 mv_cache[ 1 ][1]=
4638 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4639 }else if(IS_SUB_8X4(sub_mb_type)){
4640 mv_cache[ 1 ][0]= mx;
4641 mv_cache[ 1 ][1]= my;
4642 }else if(IS_SUB_4X8(sub_mb_type)){
4643 mv_cache[ 8 ][0]= mx;
4644 mv_cache[ 8 ][1]= my;
4646 mv_cache[ 0 ][0]= mx;
4647 mv_cache[ 0 ][1]= my;
4649 }else{
4650 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4651 p[0] = p[1]=
4652 p[8] = p[9]= 0;
4656 }else if(IS_DIRECT(mb_type)){
4657 pred_direct_motion(h, &mb_type);
4658 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4659 }else{
4660 int list, mx, my, i;
4661 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4662 if(IS_16X16(mb_type)){
4663 for(list=0; list<h->list_count; list++){
4664 unsigned int val;
4665 if(IS_DIR(mb_type, 0, list)){
4666 if(h->ref_count[list]==1){
4667 val= 0;
4668 }else if(h->ref_count[list]==2){
4669 val= get_bits1(&s->gb)^1;
4670 }else{
4671 val= get_ue_golomb_31(&s->gb);
4672 if(val >= h->ref_count[list]){
4673 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4674 return -1;
4677 }else
4678 val= LIST_NOT_USED&0xFF;
4679 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4681 for(list=0; list<h->list_count; list++){
4682 unsigned int val;
4683 if(IS_DIR(mb_type, 0, list)){
4684 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4685 mx += get_se_golomb(&s->gb);
4686 my += get_se_golomb(&s->gb);
4687 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4689 val= pack16to32(mx,my);
4690 }else
4691 val=0;
4692 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4695 else if(IS_16X8(mb_type)){
4696 for(list=0; list<h->list_count; list++){
4697 for(i=0; i<2; i++){
4698 unsigned int val;
4699 if(IS_DIR(mb_type, i, list)){
4700 if(h->ref_count[list] == 1){
4701 val= 0;
4702 }else if(h->ref_count[list] == 2){
4703 val= get_bits1(&s->gb)^1;
4704 }else{
4705 val= get_ue_golomb_31(&s->gb);
4706 if(val >= h->ref_count[list]){
4707 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4708 return -1;
4711 }else
4712 val= LIST_NOT_USED&0xFF;
4713 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4716 for(list=0; list<h->list_count; list++){
4717 for(i=0; i<2; i++){
4718 unsigned int val;
4719 if(IS_DIR(mb_type, i, list)){
4720 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4721 mx += get_se_golomb(&s->gb);
4722 my += get_se_golomb(&s->gb);
4723 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4725 val= pack16to32(mx,my);
4726 }else
4727 val=0;
4728 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4731 }else{
4732 assert(IS_8X16(mb_type));
4733 for(list=0; list<h->list_count; list++){
4734 for(i=0; i<2; i++){
4735 unsigned int val;
4736 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4737 if(h->ref_count[list]==1){
4738 val= 0;
4739 }else if(h->ref_count[list]==2){
4740 val= get_bits1(&s->gb)^1;
4741 }else{
4742 val= get_ue_golomb_31(&s->gb);
4743 if(val >= h->ref_count[list]){
4744 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4745 return -1;
4748 }else
4749 val= LIST_NOT_USED&0xFF;
4750 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4753 for(list=0; list<h->list_count; list++){
4754 for(i=0; i<2; i++){
4755 unsigned int val;
4756 if(IS_DIR(mb_type, i, list)){
4757 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4758 mx += get_se_golomb(&s->gb);
4759 my += get_se_golomb(&s->gb);
4760 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4762 val= pack16to32(mx,my);
4763 }else
4764 val=0;
4765 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4771 if(IS_INTER(mb_type))
4772 write_back_motion(h, mb_type);
4774 if(!IS_INTRA16x16(mb_type)){
4775 cbp= get_ue_golomb(&s->gb);
4776 if(cbp > 47){
4777 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4778 return -1;
4781 if(CHROMA){
4782 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4783 else cbp= golomb_to_inter_cbp [cbp];
4784 }else{
4785 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4786 else cbp= golomb_to_inter_cbp_gray[cbp];
4789 h->cbp = cbp;
4791 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4792 if(get_bits1(&s->gb)){
4793 mb_type |= MB_TYPE_8x8DCT;
4794 h->cbp_table[mb_xy]= cbp;
4797 s->current_picture.mb_type[mb_xy]= mb_type;
4799 if(cbp || IS_INTRA16x16(mb_type)){
4800 int i8x8, i4x4, chroma_idx;
4801 int dquant;
4802 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4803 const uint8_t *scan, *scan8x8, *dc_scan;
4805 // fill_non_zero_count_cache(h);
4807 if(IS_INTERLACED(mb_type)){
4808 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4809 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4810 dc_scan= luma_dc_field_scan;
4811 }else{
4812 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4813 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4814 dc_scan= luma_dc_zigzag_scan;
4817 dquant= get_se_golomb(&s->gb);
4819 if( dquant > 25 || dquant < -26 ){
4820 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4821 return -1;
4824 s->qscale += dquant;
4825 if(((unsigned)s->qscale) > 51){
4826 if(s->qscale<0) s->qscale+= 52;
4827 else s->qscale-= 52;
4830 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4831 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4832 if(IS_INTRA16x16(mb_type)){
4833 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4834 return -1; //FIXME continue if partitioned and other return -1 too
4837 assert((cbp&15) == 0 || (cbp&15) == 15);
4839 if(cbp&15){
4840 for(i8x8=0; i8x8<4; i8x8++){
4841 for(i4x4=0; i4x4<4; i4x4++){
4842 const int index= i4x4 + 4*i8x8;
4843 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4844 return -1;
4848 }else{
4849 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4851 }else{
4852 for(i8x8=0; i8x8<4; i8x8++){
4853 if(cbp & (1<<i8x8)){
4854 if(IS_8x8DCT(mb_type)){
4855 DCTELEM *buf = &h->mb[64*i8x8];
4856 uint8_t *nnz;
4857 for(i4x4=0; i4x4<4; i4x4++){
4858 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4859 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4860 return -1;
4862 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4863 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4864 }else{
4865 for(i4x4=0; i4x4<4; i4x4++){
4866 const int index= i4x4 + 4*i8x8;
4868 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4869 return -1;
4873 }else{
4874 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4875 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4880 if(cbp&0x30){
4881 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4882 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4883 return -1;
4887 if(cbp&0x20){
4888 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4889 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4890 for(i4x4=0; i4x4<4; i4x4++){
4891 const int index= 16 + 4*chroma_idx + i4x4;
4892 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4893 return -1;
4897 }else{
4898 uint8_t * const nnz= &h->non_zero_count_cache[0];
4899 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4900 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4902 }else{
4903 uint8_t * const nnz= &h->non_zero_count_cache[0];
4904 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4905 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4906 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4908 s->current_picture.qscale_table[mb_xy]= s->qscale;
4909 write_back_non_zero_count(h);
4911 if(MB_MBAFF){
4912 h->ref_count[0] >>= 1;
4913 h->ref_count[1] >>= 1;
4916 return 0;
4919 static int decode_cabac_field_decoding_flag(H264Context *h) {
4920 MpegEncContext * const s = &h->s;
4921 const int mb_x = s->mb_x;
4922 const int mb_y = s->mb_y & ~1;
4923 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4924 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4926 unsigned int ctx = 0;
4928 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4929 ctx += 1;
4931 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4932 ctx += 1;
4935 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4938 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4939 uint8_t *state= &h->cabac_state[ctx_base];
4940 int mb_type;
4942 if(intra_slice){
4943 MpegEncContext * const s = &h->s;
4944 const int mba_xy = h->left_mb_xy[0];
4945 const int mbb_xy = h->top_mb_xy;
4946 int ctx=0;
4947 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4948 ctx++;
4949 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4950 ctx++;
4951 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4952 return 0; /* I4x4 */
4953 state += 2;
4954 }else{
4955 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4956 return 0; /* I4x4 */
4959 if( get_cabac_terminate( &h->cabac ) )
4960 return 25; /* PCM */
4962 mb_type = 1; /* I16x16 */
4963 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4964 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4965 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4966 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4967 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4968 return mb_type;
4971 static int decode_cabac_mb_type_b( H264Context *h ) {
4972 MpegEncContext * const s = &h->s;
4974 const int mba_xy = h->left_mb_xy[0];
4975 const int mbb_xy = h->top_mb_xy;
4976 int ctx = 0;
4977 int bits;
4978 assert(h->slice_type_nos == FF_B_TYPE);
4980 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4981 ctx++;
4982 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4983 ctx++;
4985 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4986 return 0; /* B_Direct_16x16 */
4988 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4989 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4992 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4993 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4994 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4995 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4996 if( bits < 8 )
4997 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4998 else if( bits == 13 ) {
4999 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5000 } else if( bits == 14 )
5001 return 11; /* B_L1_L0_8x16 */
5002 else if( bits == 15 )
5003 return 22; /* B_8x8 */
5005 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5006 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5009 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5010 MpegEncContext * const s = &h->s;
5011 int mba_xy, mbb_xy;
5012 int ctx = 0;
5014 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5015 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5016 mba_xy = mb_xy - 1;
5017 if( (mb_y&1)
5018 && h->slice_table[mba_xy] == h->slice_num
5019 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5020 mba_xy += s->mb_stride;
5021 if( MB_FIELD ){
5022 mbb_xy = mb_xy - s->mb_stride;
5023 if( !(mb_y&1)
5024 && h->slice_table[mbb_xy] == h->slice_num
5025 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5026 mbb_xy -= s->mb_stride;
5027 }else
5028 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5029 }else{
5030 int mb_xy = h->mb_xy;
5031 mba_xy = mb_xy - 1;
5032 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5035 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5036 ctx++;
5037 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5038 ctx++;
5040 if( h->slice_type_nos == FF_B_TYPE )
5041 ctx += 13;
5042 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5045 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5046 int mode = 0;
5048 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5049 return pred_mode;
5051 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5052 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5053 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5055 if( mode >= pred_mode )
5056 return mode + 1;
5057 else
5058 return mode;
5061 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5062 const int mba_xy = h->left_mb_xy[0];
5063 const int mbb_xy = h->top_mb_xy;
5065 int ctx = 0;
5067 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5068 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5069 ctx++;
5071 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5072 ctx++;
5074 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5075 return 0;
5077 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5078 return 1;
5079 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5080 return 2;
5081 else
5082 return 3;
5085 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5086 int cbp_b, cbp_a, ctx, cbp = 0;
5088 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5089 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
5091 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5092 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5093 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
5094 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5095 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
5096 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5097 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
5098 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5099 return cbp;
5101 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5102 int ctx;
5103 int cbp_a, cbp_b;
5105 cbp_a = (h->left_cbp>>4)&0x03;
5106 cbp_b = (h-> top_cbp>>4)&0x03;
5108 ctx = 0;
5109 if( cbp_a > 0 ) ctx++;
5110 if( cbp_b > 0 ) ctx += 2;
5111 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5112 return 0;
5114 ctx = 4;
5115 if( cbp_a == 2 ) ctx++;
5116 if( cbp_b == 2 ) ctx += 2;
5117 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5119 static int decode_cabac_mb_dqp( H264Context *h) {
5120 int ctx= h->last_qscale_diff != 0;
5121 int val = 0;
5123 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5124 ctx= 2+(ctx>>1);
5125 val++;
5126 if(val > 102) //prevent infinite loop
5127 return INT_MIN;
5130 if( val&0x01 )
5131 return (val + 1)>>1 ;
5132 else
5133 return -((val + 1)>>1);
5135 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5136 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5137 return 0; /* 8x8 */
5138 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5139 return 1; /* 8x4 */
5140 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5141 return 2; /* 4x8 */
5142 return 3; /* 4x4 */
5144 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5145 int type;
5146 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5147 return 0; /* B_Direct_8x8 */
5148 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5149 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5150 type = 3;
5151 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5152 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5153 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5154 type += 4;
5156 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5157 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5158 return type;
5161 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5162 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5165 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5166 int refa = h->ref_cache[list][scan8[n] - 1];
5167 int refb = h->ref_cache[list][scan8[n] - 8];
5168 int ref = 0;
5169 int ctx = 0;
5171 if( h->slice_type_nos == FF_B_TYPE) {
5172 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5173 ctx++;
5174 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5175 ctx += 2;
5176 } else {
5177 if( refa > 0 )
5178 ctx++;
5179 if( refb > 0 )
5180 ctx += 2;
5183 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5184 ref++;
5185 ctx = (ctx>>2)+4;
5186 if(ref >= 32 /*h->ref_list[list]*/){
5187 return -1;
5190 return ref;
5193 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5194 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5195 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5196 int ctxbase = (l == 0) ? 40 : 47;
5197 int mvd;
5198 int ctx = (amvd>2) + (amvd>32);
5200 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5201 return 0;
5203 mvd= 1;
5204 ctx= 3;
5205 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5206 mvd++;
5207 if( ctx < 6 )
5208 ctx++;
5211 if( mvd >= 9 ) {
5212 int k = 3;
5213 while( get_cabac_bypass( &h->cabac ) ) {
5214 mvd += 1 << k;
5215 k++;
5216 if(k>24){
5217 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5218 return INT_MIN;
5221 while( k-- ) {
5222 if( get_cabac_bypass( &h->cabac ) )
5223 mvd += 1 << k;
5226 return get_cabac_bypass_sign( &h->cabac, -mvd );
5229 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5230 int nza, nzb;
5231 int ctx = 0;
5233 if( is_dc ) {
5234 if( cat == 0 ) {
5235 nza = h->left_cbp&0x100;
5236 nzb = h-> top_cbp&0x100;
5237 } else {
5238 nza = (h->left_cbp>>(6+idx))&0x01;
5239 nzb = (h-> top_cbp>>(6+idx))&0x01;
5241 } else {
5242 assert(cat == 1 || cat == 2 || cat == 4);
5243 nza = h->non_zero_count_cache[scan8[idx] - 1];
5244 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5247 if( nza > 0 )
5248 ctx++;
5250 if( nzb > 0 )
5251 ctx += 2;
5253 return ctx + 4 * cat;
5256 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5257 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5258 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5259 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5260 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5263 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5264 static const int significant_coeff_flag_offset[2][6] = {
5265 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5266 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5268 static const int last_coeff_flag_offset[2][6] = {
5269 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5270 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5272 static const int coeff_abs_level_m1_offset[6] = {
5273 227+0, 227+10, 227+20, 227+30, 227+39, 426
5275 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5276 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5277 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5278 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5279 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5280 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5281 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5282 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5283 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5285 /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5286 * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5287 * map node ctx => cabac ctx for level=1 */
5288 static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5289 /* map node ctx => cabac ctx for level>1 */
5290 static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5291 static const uint8_t coeff_abs_level_transition[2][8] = {
5292 /* update node ctx after decoding a level=1 */
5293 { 1, 2, 3, 3, 4, 5, 6, 7 },
5294 /* update node ctx after decoding a level>1 */
5295 { 4, 4, 4, 4, 5, 6, 7, 7 }
5298 int index[64];
5300 int av_unused last;
5301 int coeff_count = 0;
5302 int node_ctx = 0;
5304 uint8_t *significant_coeff_ctx_base;
5305 uint8_t *last_coeff_ctx_base;
5306 uint8_t *abs_level_m1_ctx_base;
5308 #if !ARCH_X86
5309 #define CABAC_ON_STACK
5310 #endif
5311 #ifdef CABAC_ON_STACK
5312 #define CC &cc
5313 CABACContext cc;
5314 cc.range = h->cabac.range;
5315 cc.low = h->cabac.low;
5316 cc.bytestream= h->cabac.bytestream;
5317 #else
5318 #define CC &h->cabac
5319 #endif
5322 /* cat: 0-> DC 16x16 n = 0
5323 * 1-> AC 16x16 n = luma4x4idx
5324 * 2-> Luma4x4 n = luma4x4idx
5325 * 3-> DC Chroma n = iCbCr
5326 * 4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5327 * 5-> Luma8x8 n = 4 * luma8x8idx
5330 /* read coded block flag */
5331 if( is_dc || cat != 5 ) {
5332 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5333 if( !is_dc )
5334 h->non_zero_count_cache[scan8[n]] = 0;
5336 #ifdef CABAC_ON_STACK
5337 h->cabac.range = cc.range ;
5338 h->cabac.low = cc.low ;
5339 h->cabac.bytestream= cc.bytestream;
5340 #endif
5341 return;
5345 significant_coeff_ctx_base = h->cabac_state
5346 + significant_coeff_flag_offset[MB_FIELD][cat];
5347 last_coeff_ctx_base = h->cabac_state
5348 + last_coeff_flag_offset[MB_FIELD][cat];
5349 abs_level_m1_ctx_base = h->cabac_state
5350 + coeff_abs_level_m1_offset[cat];
5352 if( !is_dc && cat == 5 ) {
5353 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5354 for(last= 0; last < coefs; last++) { \
5355 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5356 if( get_cabac( CC, sig_ctx )) { \
5357 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5358 index[coeff_count++] = last; \
5359 if( get_cabac( CC, last_ctx ) ) { \
5360 last= max_coeff; \
5361 break; \
5365 if( last == max_coeff -1 ) {\
5366 index[coeff_count++] = last;\
5368 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5369 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5370 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5371 } else {
5372 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5373 #else
5374 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5375 } else {
5376 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5377 #endif
5379 assert(coeff_count > 0);
5381 if( is_dc ) {
5382 if( cat == 0 )
5383 h->cbp_table[h->mb_xy] |= 0x100;
5384 else
5385 h->cbp_table[h->mb_xy] |= 0x40 << n;
5386 } else {
5387 if( cat == 5 )
5388 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5389 else {
5390 assert( cat == 1 || cat == 2 || cat == 4 );
5391 h->non_zero_count_cache[scan8[n]] = coeff_count;
5395 do {
5396 uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5398 int j= scantable[index[--coeff_count]];
5400 if( get_cabac( CC, ctx ) == 0 ) {
5401 node_ctx = coeff_abs_level_transition[0][node_ctx];
5402 if( is_dc ) {
5403 block[j] = get_cabac_bypass_sign( CC, -1);
5404 }else{
5405 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5407 } else {
5408 int coeff_abs = 2;
5409 ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5410 node_ctx = coeff_abs_level_transition[1][node_ctx];
5412 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5413 coeff_abs++;
5416 if( coeff_abs >= 15 ) {
5417 int j = 0;
5418 while( get_cabac_bypass( CC ) ) {
5419 j++;
5422 coeff_abs=1;
5423 while( j-- ) {
5424 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5426 coeff_abs+= 14;
5429 if( is_dc ) {
5430 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5431 }else{
5432 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5435 } while( coeff_count );
5436 #ifdef CABAC_ON_STACK
5437 h->cabac.range = cc.range ;
5438 h->cabac.low = cc.low ;
5439 h->cabac.bytestream= cc.bytestream;
5440 #endif
5444 #if !CONFIG_SMALL
5445 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5446 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5449 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5450 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5452 #endif
5454 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5455 #if CONFIG_SMALL
5456 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5457 #else
5458 if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5459 else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5460 #endif
5463 static inline void compute_mb_neighbors(H264Context *h)
5465 MpegEncContext * const s = &h->s;
5466 const int mb_xy = h->mb_xy;
5467 h->top_mb_xy = mb_xy - s->mb_stride;
5468 h->left_mb_xy[0] = mb_xy - 1;
5469 if(FRAME_MBAFF){
5470 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5471 const int top_pair_xy = pair_xy - s->mb_stride;
5472 const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5473 const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5474 const int curr_mb_field_flag = MB_FIELD;
5475 const int bottom = (s->mb_y & 1);
5477 if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5478 h->top_mb_xy -= s->mb_stride;
5480 if (!left_mb_field_flag == curr_mb_field_flag) {
5481 h->left_mb_xy[0] = pair_xy - 1;
5483 } else if (FIELD_PICTURE) {
5484 h->top_mb_xy -= s->mb_stride;
5486 return;
5490 * decodes a macroblock
5491 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5493 static int decode_mb_cabac(H264Context *h) {
5494 MpegEncContext * const s = &h->s;
5495 int mb_xy;
5496 int mb_type, partition_count, cbp = 0;
5497 int dct8x8_allowed= h->pps.transform_8x8_mode;
5499 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5501 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5502 if( h->slice_type_nos != FF_I_TYPE ) {
5503 int skip;
5504 /* a skipped mb needs the aff flag from the following mb */
5505 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5506 predict_field_decoding_flag(h);
5507 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5508 skip = h->next_mb_skipped;
5509 else
5510 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5511 /* read skip flags */
5512 if( skip ) {
5513 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5514 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5515 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5516 if(!h->next_mb_skipped)
5517 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5520 decode_mb_skip(h);
5522 h->cbp_table[mb_xy] = 0;
5523 h->chroma_pred_mode_table[mb_xy] = 0;
5524 h->last_qscale_diff = 0;
5526 return 0;
5530 if(FRAME_MBAFF){
5531 if( (s->mb_y&1) == 0 )
5532 h->mb_mbaff =
5533 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5536 h->prev_mb_skipped = 0;
5538 compute_mb_neighbors(h);
5540 if( h->slice_type_nos == FF_B_TYPE ) {
5541 mb_type = decode_cabac_mb_type_b( h );
5542 if( mb_type < 23 ){
5543 partition_count= b_mb_type_info[mb_type].partition_count;
5544 mb_type= b_mb_type_info[mb_type].type;
5545 }else{
5546 mb_type -= 23;
5547 goto decode_intra_mb;
5549 } else if( h->slice_type_nos == FF_P_TYPE ) {
5550 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5551 /* P-type */
5552 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5553 /* P_L0_D16x16, P_8x8 */
5554 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5555 } else {
5556 /* P_L0_D8x16, P_L0_D16x8 */
5557 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5559 partition_count= p_mb_type_info[mb_type].partition_count;
5560 mb_type= p_mb_type_info[mb_type].type;
5561 } else {
5562 mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5563 goto decode_intra_mb;
5565 } else {
5566 mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5567 if(h->slice_type == FF_SI_TYPE && mb_type)
5568 mb_type--;
5569 assert(h->slice_type_nos == FF_I_TYPE);
5570 decode_intra_mb:
5571 partition_count = 0;
5572 cbp= i_mb_type_info[mb_type].cbp;
5573 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5574 mb_type= i_mb_type_info[mb_type].type;
5576 if(MB_FIELD)
5577 mb_type |= MB_TYPE_INTERLACED;
5579 h->slice_table[ mb_xy ]= h->slice_num;
5581 if(IS_INTRA_PCM(mb_type)) {
5582 const uint8_t *ptr;
5584 // We assume these blocks are very rare so we do not optimize it.
5585 // FIXME The two following lines get the bitstream position in the cabac
5586 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5587 ptr= h->cabac.bytestream;
5588 if(h->cabac.low&0x1) ptr--;
5589 if(CABAC_BITS==16){
5590 if(h->cabac.low&0x1FF) ptr--;
5593 // The pixels are stored in the same order as levels in h->mb array.
5594 memcpy(h->mb, ptr, 256); ptr+=256;
5595 if(CHROMA){
5596 memcpy(h->mb+128, ptr, 128); ptr+=128;
5599 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5601 // All blocks are present
5602 h->cbp_table[mb_xy] = 0x1ef;
5603 h->chroma_pred_mode_table[mb_xy] = 0;
5604 // In deblocking, the quantizer is 0
5605 s->current_picture.qscale_table[mb_xy]= 0;
5606 // All coeffs are present
5607 memset(h->non_zero_count[mb_xy], 16, 16);
5608 s->current_picture.mb_type[mb_xy]= mb_type;
5609 h->last_qscale_diff = 0;
5610 return 0;
5613 if(MB_MBAFF){
5614 h->ref_count[0] <<= 1;
5615 h->ref_count[1] <<= 1;
5618 fill_caches(h, mb_type, 0);
5620 if( IS_INTRA( mb_type ) ) {
5621 int i, pred_mode;
5622 if( IS_INTRA4x4( mb_type ) ) {
5623 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5624 mb_type |= MB_TYPE_8x8DCT;
5625 for( i = 0; i < 16; i+=4 ) {
5626 int pred = pred_intra_mode( h, i );
5627 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5628 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5630 } else {
5631 for( i = 0; i < 16; i++ ) {
5632 int pred = pred_intra_mode( h, i );
5633 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5635 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5638 write_back_intra_pred_mode(h);
5639 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5640 } else {
5641 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5642 if( h->intra16x16_pred_mode < 0 ) return -1;
5644 if(CHROMA){
5645 h->chroma_pred_mode_table[mb_xy] =
5646 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5648 pred_mode= check_intra_pred_mode( h, pred_mode );
5649 if( pred_mode < 0 ) return -1;
5650 h->chroma_pred_mode= pred_mode;
5652 } else if( partition_count == 4 ) {
5653 int i, j, sub_partition_count[4], list, ref[2][4];
5655 if( h->slice_type_nos == FF_B_TYPE ) {
5656 for( i = 0; i < 4; i++ ) {
5657 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5658 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5659 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5661 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5662 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5663 pred_direct_motion(h, &mb_type);
5664 h->ref_cache[0][scan8[4]] =
5665 h->ref_cache[1][scan8[4]] =
5666 h->ref_cache[0][scan8[12]] =
5667 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5668 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5669 for( i = 0; i < 4; i++ )
5670 if( IS_DIRECT(h->sub_mb_type[i]) )
5671 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5674 } else {
5675 for( i = 0; i < 4; i++ ) {
5676 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5677 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5678 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5682 for( list = 0; list < h->list_count; list++ ) {
5683 for( i = 0; i < 4; i++ ) {
5684 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5685 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5686 if( h->ref_count[list] > 1 ){
5687 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5688 if(ref[list][i] >= (unsigned)h->ref_count[list]){
5689 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5690 return -1;
5692 }else
5693 ref[list][i] = 0;
5694 } else {
5695 ref[list][i] = -1;
5697 h->ref_cache[list][ scan8[4*i]+1 ]=
5698 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5702 if(dct8x8_allowed)
5703 dct8x8_allowed = get_dct8x8_allowed(h);
5705 for(list=0; list<h->list_count; list++){
5706 for(i=0; i<4; i++){
5707 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5708 if(IS_DIRECT(h->sub_mb_type[i])){
5709 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5710 continue;
5713 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5714 const int sub_mb_type= h->sub_mb_type[i];
5715 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5716 for(j=0; j<sub_partition_count[i]; j++){
5717 int mpx, mpy;
5718 int mx, my;
5719 const int index= 4*i + block_width*j;
5720 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5721 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5722 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5724 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5725 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5726 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5728 if(IS_SUB_8X8(sub_mb_type)){
5729 mv_cache[ 1 ][0]=
5730 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5731 mv_cache[ 1 ][1]=
5732 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5734 mvd_cache[ 1 ][0]=
5735 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5736 mvd_cache[ 1 ][1]=
5737 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5738 }else if(IS_SUB_8X4(sub_mb_type)){
5739 mv_cache[ 1 ][0]= mx;
5740 mv_cache[ 1 ][1]= my;
5742 mvd_cache[ 1 ][0]= mx - mpx;
5743 mvd_cache[ 1 ][1]= my - mpy;
5744 }else if(IS_SUB_4X8(sub_mb_type)){
5745 mv_cache[ 8 ][0]= mx;
5746 mv_cache[ 8 ][1]= my;
5748 mvd_cache[ 8 ][0]= mx - mpx;
5749 mvd_cache[ 8 ][1]= my - mpy;
5751 mv_cache[ 0 ][0]= mx;
5752 mv_cache[ 0 ][1]= my;
5754 mvd_cache[ 0 ][0]= mx - mpx;
5755 mvd_cache[ 0 ][1]= my - mpy;
5757 }else{
5758 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5759 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5760 p[0] = p[1] = p[8] = p[9] = 0;
5761 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5765 } else if( IS_DIRECT(mb_type) ) {
5766 pred_direct_motion(h, &mb_type);
5767 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5768 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5769 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5770 } else {
5771 int list, mx, my, i, mpx, mpy;
5772 if(IS_16X16(mb_type)){
5773 for(list=0; list<h->list_count; list++){
5774 if(IS_DIR(mb_type, 0, list)){
5775 int ref;
5776 if(h->ref_count[list] > 1){
5777 ref= decode_cabac_mb_ref(h, list, 0);
5778 if(ref >= (unsigned)h->ref_count[list]){
5779 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5780 return -1;
5782 }else
5783 ref=0;
5784 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5785 }else
5786 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5788 for(list=0; list<h->list_count; list++){
5789 if(IS_DIR(mb_type, 0, list)){
5790 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5792 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5793 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5794 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5796 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5797 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5798 }else
5799 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5802 else if(IS_16X8(mb_type)){
5803 for(list=0; list<h->list_count; list++){
5804 for(i=0; i<2; i++){
5805 if(IS_DIR(mb_type, i, list)){
5806 int ref;
5807 if(h->ref_count[list] > 1){
5808 ref= decode_cabac_mb_ref( h, list, 8*i );
5809 if(ref >= (unsigned)h->ref_count[list]){
5810 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5811 return -1;
5813 }else
5814 ref=0;
5815 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5816 }else
5817 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5820 for(list=0; list<h->list_count; list++){
5821 for(i=0; i<2; i++){
5822 if(IS_DIR(mb_type, i, list)){
5823 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5824 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5825 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5826 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5828 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5829 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5830 }else{
5831 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5832 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5836 }else{
5837 assert(IS_8X16(mb_type));
5838 for(list=0; list<h->list_count; list++){
5839 for(i=0; i<2; i++){
5840 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5841 int ref;
5842 if(h->ref_count[list] > 1){
5843 ref= decode_cabac_mb_ref( h, list, 4*i );
5844 if(ref >= (unsigned)h->ref_count[list]){
5845 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5846 return -1;
5848 }else
5849 ref=0;
5850 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5851 }else
5852 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5855 for(list=0; list<h->list_count; list++){
5856 for(i=0; i<2; i++){
5857 if(IS_DIR(mb_type, i, list)){
5858 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5859 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5860 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5862 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5863 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5864 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5865 }else{
5866 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5867 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5874 if( IS_INTER( mb_type ) ) {
5875 h->chroma_pred_mode_table[mb_xy] = 0;
5876 write_back_motion( h, mb_type );
5879 if( !IS_INTRA16x16( mb_type ) ) {
5880 cbp = decode_cabac_mb_cbp_luma( h );
5881 if(CHROMA)
5882 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5885 h->cbp_table[mb_xy] = h->cbp = cbp;
5887 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5888 if( decode_cabac_mb_transform_size( h ) )
5889 mb_type |= MB_TYPE_8x8DCT;
5891 s->current_picture.mb_type[mb_xy]= mb_type;
5893 if( cbp || IS_INTRA16x16( mb_type ) ) {
5894 const uint8_t *scan, *scan8x8, *dc_scan;
5895 const uint32_t *qmul;
5896 int dqp;
5898 if(IS_INTERLACED(mb_type)){
5899 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5900 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5901 dc_scan= luma_dc_field_scan;
5902 }else{
5903 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5904 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5905 dc_scan= luma_dc_zigzag_scan;
5908 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5909 if( dqp == INT_MIN ){
5910 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5911 return -1;
5913 s->qscale += dqp;
5914 if(((unsigned)s->qscale) > 51){
5915 if(s->qscale<0) s->qscale+= 52;
5916 else s->qscale-= 52;
5918 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5919 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5921 if( IS_INTRA16x16( mb_type ) ) {
5922 int i;
5923 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5924 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5926 if( cbp&15 ) {
5927 qmul = h->dequant4_coeff[0][s->qscale];
5928 for( i = 0; i < 16; i++ ) {
5929 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5930 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5932 } else {
5933 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5935 } else {
5936 int i8x8, i4x4;
5937 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5938 if( cbp & (1<<i8x8) ) {
5939 if( IS_8x8DCT(mb_type) ) {
5940 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5941 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5942 } else {
5943 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5944 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5945 const int index = 4*i8x8 + i4x4;
5946 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5947 //START_TIMER
5948 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5949 //STOP_TIMER("decode_residual")
5952 } else {
5953 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5954 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5959 if( cbp&0x30 ){
5960 int c;
5961 for( c = 0; c < 2; c++ ) {
5962 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5963 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5967 if( cbp&0x20 ) {
5968 int c, i;
5969 for( c = 0; c < 2; c++ ) {
5970 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5971 for( i = 0; i < 4; i++ ) {
5972 const int index = 16 + 4 * c + i;
5973 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5974 decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5977 } else {
5978 uint8_t * const nnz= &h->non_zero_count_cache[0];
5979 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5980 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5982 } else {
5983 uint8_t * const nnz= &h->non_zero_count_cache[0];
5984 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5985 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5986 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5987 h->last_qscale_diff = 0;
5990 s->current_picture.qscale_table[mb_xy]= s->qscale;
5991 write_back_non_zero_count(h);
5993 if(MB_MBAFF){
5994 h->ref_count[0] >>= 1;
5995 h->ref_count[1] >>= 1;
5998 return 0;
6002 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6003 const int index_a = qp + h->slice_alpha_c0_offset;
6004 const int alpha = (alpha_table+52)[index_a];
6005 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6006 if (alpha ==0 || beta == 0) return;
6008 if( bS[0] < 4 ) {
6009 int8_t tc[4];
6010 tc[0] = (tc0_table+52)[index_a][bS[0]];
6011 tc[1] = (tc0_table+52)[index_a][bS[1]];
6012 tc[2] = (tc0_table+52)[index_a][bS[2]];
6013 tc[3] = (tc0_table+52)[index_a][bS[3]];
6014 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6015 } else {
6016 h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6019 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6020 const int index_a = qp + h->slice_alpha_c0_offset;
6021 const int alpha = (alpha_table+52)[index_a];
6022 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6023 if (alpha ==0 || beta == 0) return;
6025 if( bS[0] < 4 ) {
6026 int8_t tc[4];
6027 tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6028 tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6029 tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6030 tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6031 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6032 } else {
6033 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/**
 * Deblock one vertical luma edge of an MBAFF macroblock pair.
 * Filters all 16 rows one pixel row at a time in C (rather than via the
 * dsputil loop filters) because adjacent rows may belong to different
 * fields and thus use different bS/QP values.
 *
 * @param pix    pointer to the first luma row right of the edge
 * @param stride luma line size
 * @param bS     8 boundary strengths (two interleaved sets of 4)
 * @param qp     2 luma QPs (already averaged with the left neighbour)
 */
static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
    int i;
    for( i = 0; i < 16; i++, pix += stride) {
        int index_a;
        int alpha;
        int beta;

        int qp_index;
        int bS_index = (i >> 1);
        if (!MB_FIELD) {
            /* frame MB: alternate rows pick alternate bS entries */
            bS_index &= ~1;
            bS_index |= (i & 1);
        }

        if( bS[bS_index] == 0 ) {
            continue;       /* nothing to filter on this row */
        }

        /* field MB: top/bottom half select qp[0]/qp[1]; frame MB: per-row parity */
        qp_index = MB_FIELD ? (i >> 3) : (i & 1);
        index_a = qp[qp_index] + h->slice_alpha_c0_offset;
        alpha = (alpha_table+52)[index_a];
        beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];

        if( bS[bS_index] < 4 ) {
            /* normal filtering, deltas clipped by tc (tc0 plus one per
             * additionally-filtered side) */
            const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
            const int p0 = pix[-1];
            const int p1 = pix[-2];
            const int p2 = pix[-3];
            const int q0 = pix[0];
            const int q1 = pix[1];
            const int q2 = pix[2];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {
                int tc = tc0;
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    /* p1' — also widens the p0/q0 clipping range */
                    pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    /* q1' — also widens the p0/q0 clipping range */
                    pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
            }
        }else{
            /* bS == 4: strong (intra-edge) filtering, no tc clipping */
            const int p0 = pix[-1];
            const int p1 = pix[-2];
            const int p2 = pix[-3];

            const int q0 = pix[0];
            const int q1 = pix[1];
            const int q2 = pix[2];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                    if( FFABS( p2 - p0 ) < beta)
                    {
                        const int p3 = pix[-4];
                        /* p0', p1', p2' */
                        pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                        pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                        pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                    } else {
                        /* p0' */
                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                    }
                    if( FFABS( q2 - q0 ) < beta)
                    {
                        const int q3 = pix[3];
                        /* q0', q1', q2' */
                        pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                        pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                        pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                    } else {
                        /* q0' */
                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                    }
                }else{
                    /* weaker fallback: p0', q0' only */
                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                    pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
                tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
            }
        }
    }
}
/**
 * Deblock one vertical chroma edge of an MBAFF macroblock pair.
 * Chroma analogue of filter_mb_mbaff_edgev: 8 rows, filtered one at a
 * time because adjacent rows may use different bS/QP values.
 *
 * @param pix    pointer to the first chroma row right of the edge
 * @param stride chroma line size
 * @param bS     8 boundary strengths (one per row)
 * @param qp     2 chroma QPs (already averaged with the left neighbour)
 */
static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
    int i;
    for( i = 0; i < 8; i++, pix += stride) {
        int index_a;
        int alpha;
        int beta;

        int qp_index;
        int bS_index = i;

        if( bS[bS_index] == 0 ) {
            continue;       /* nothing to filter on this row */
        }

        /* field MB: top/bottom half select qp[0]/qp[1]; frame MB: per-row parity */
        qp_index = MB_FIELD ? (i >> 2) : (i & 1);
        index_a = qp[qp_index] + h->slice_alpha_c0_offset;
        alpha = (alpha_table+52)[index_a];
        beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];

        if( bS[bS_index] < 4 ) {
            /* normal chroma filtering; clipping limit is tc0 + 1 */
            const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
            const int p0 = pix[-1];
            const int p1 = pix[-2];
            const int q0 = pix[0];
            const int q1 = pix[1];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {
                const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
            }
        }else{
            /* bS == 4: strong (intra-edge) chroma filtering */
            const int p0 = pix[-1];
            const int p1 = pix[-2];
            const int q0 = pix[0];
            const int q1 = pix[1];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
                pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
                tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
            }
        }
    }
}
6188 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6189 const int index_a = qp + h->slice_alpha_c0_offset;
6190 const int alpha = (alpha_table+52)[index_a];
6191 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6192 if (alpha ==0 || beta == 0) return;
6194 if( bS[0] < 4 ) {
6195 int8_t tc[4];
6196 tc[0] = (tc0_table+52)[index_a][bS[0]];
6197 tc[1] = (tc0_table+52)[index_a][bS[1]];
6198 tc[2] = (tc0_table+52)[index_a][bS[2]];
6199 tc[3] = (tc0_table+52)[index_a][bS[3]];
6200 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6201 } else {
6202 h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6206 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6207 const int index_a = qp + h->slice_alpha_c0_offset;
6208 const int alpha = (alpha_table+52)[index_a];
6209 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6210 if (alpha ==0 || beta == 0) return;
6212 if( bS[0] < 4 ) {
6213 int8_t tc[4];
6214 tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6215 tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6216 tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6217 tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6218 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6219 } else {
6220 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/**
 * Fast-path deblocking of one macroblock.
 * Falls back to the full filter_mb() for cases it cannot handle (picture
 * edges, missing dsp strength function, per-plane chroma QP differences,
 * slice-boundary filtering mode 2). Otherwise computes boundary strengths
 * in bulk — via constants for intra MBs or the dsputil
 * h264_loop_filter_strength function for inter MBs — and filters edges
 * through the per-edge helpers.
 */
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
    MpegEncContext * const s = &h->s;
    int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
    int mb_xy, mb_type;
    int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;

    mb_xy = h->mb_xy;

    /* cases the fast path cannot handle: delegate to the full filter */
    if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
       !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
       (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
                                      h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
        filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
        return;
    }
    assert(!FRAME_MBAFF);

    /* QPs for the current MB and its left/top neighbours; edge filtering
     * uses the average of the two adjacent MBs' QPs */
    mb_type = s->current_picture.mb_type[mb_xy];
    qp      = s->current_picture.qscale_table[mb_xy];
    qp0     = s->current_picture.qscale_table[mb_xy-1];
    qp1     = s->current_picture.qscale_table[h->top_mb_xy];
    qpc  = get_chroma_qp( h, 0, qp );
    qpc0 = get_chroma_qp( h, 0, qp0 );
    qpc1 = get_chroma_qp( h, 0, qp1 );
    qp0  = (qp + qp0 + 1) >> 1;
    qp1  = (qp + qp1 + 1) >> 1;
    qpc0 = (qpc + qpc0 + 1) >> 1;
    qpc1 = (qpc + qpc1 + 1) >> 1;
    qp_thresh = 15 - h->slice_alpha_c0_offset;
    /* below this threshold the alpha/beta tables are all zero: no-op */
    if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
       qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
        return;

    if( IS_INTRA(mb_type) ) {
        /* intra MB: MB-boundary edges get bS 4 (3 for horizontal edges in
         * field pictures), internal edges bS 3 */
        int16_t bS4[4] = {4,4,4,4};
        int16_t bS3[4] = {3,3,3,3};
        int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
        if( IS_8x8DCT(mb_type) ) {
            /* 8x8 transform: only edges 0 and 2 exist */
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
        } else {
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
        }
        /* chroma: only even edges are filtered */
        filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
        filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
        return;
    } else {
        DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
        /* uint64_t view writes a whole 4-entry bS row at once */
        uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
        int edges;
        if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
            /* all relevant 8x8 blocks coded: every filtered edge gets bS 2 */
            edges = 4;
            bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
        } else {
            /* mask_edge*: edges whose mv-based bS is known to be 0 by
             * partition shape, so the strength function can skip them */
            int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
                             (mb_type & MB_TYPE_16x8) ? 1 : 0;
            int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
                             && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
                             ? 3 : 0;
            int step = IS_8x8DCT(mb_type) ? 2 : 1;
            edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
            s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
                                              (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
        }
        /* intra neighbours force maximum strength on the shared edge */
        if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
            bSv[0][0] = 0x0004000400040004ULL;
        if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
            bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;

/* filter one edge (dir 0 = vertical, 1 = horizontal); chroma only on even
 * edges; edge 0 uses the neighbour-averaged QP */
#define FILTER(hv,dir,edge)\
        if(bSv[dir][edge]) {\
            filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
            if(!(edge&1)) {\
                filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
                filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
            }\
        }
        if( edges == 1 ) {
            FILTER(v,0,0);
            FILTER(h,1,0);
        } else if( IS_8x8DCT(mb_type) ) {
            FILTER(v,0,0);
            FILTER(v,0,2);
            FILTER(h,1,0);
            FILTER(h,1,2);
        } else {
            FILTER(v,0,0);
            FILTER(v,0,1);
            FILTER(v,0,2);
            FILTER(v,0,3);
            FILTER(h,1,0);
            FILTER(h,1,1);
            FILTER(h,1,2);
            FILTER(h,1,3);
        }
#undef FILTER
    }
}
/**
 * Deblock all edges of one macroblock in one direction.
 * Computes the boundary strength (bS) for each 4-sample segment of each
 * edge per the H.264 rules, then calls the per-edge filter helpers.
 *
 * @param mb_xy    current MB index
 * @param mb_type  current MB type flags
 * @param mvy_limit vertical MV difference threshold for bS (2 or 4)
 * @param first_vertical_edge_done nonzero if the caller already filtered
 *                 the left MB edge (MBAFF special case)
 * @param dir      0 = vertical edges (left neighbour), 1 = horizontal
 *                 edges (top neighbour)
 */
static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
    MpegEncContext * const s = &h->s;
    int edge;
    /* mbm_xy: the neighbouring MB across edge 0 for this direction */
    const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
    const int mbm_type = s->current_picture.mb_type[mbm_xy];
    /* ref2frm maps reference indices to frame numbers so references can be
     * compared across slices; ref2frmm is the neighbour's mapping */
    int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
    int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
    /* 0xFFFF slice table entry means the neighbour does not exist: skip edge 0 */
    int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;

    /* 16x16 skip MBs can only have a nonzero bS on edge 0 */
    const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
                              == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
    // how often to recheck mv-based bS when iterating between edges
    const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
                          (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
    // how often to recheck mv-based bS when iterating along each edge
    const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));

    if (first_vertical_edge_done) {
        start = 1;
    }

    /* deblocking_filter==2: do not filter across slice boundaries */
    if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
        start = 1;

    if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
        && !IS_INTERLACED(mb_type)
        && IS_INTERLACED(mbm_type)
        ) {
        // This is a special case in the norm where the filtering must
        // be done twice (one each of the field) even if we are in a
        // frame macroblock.
        //
        static const int nnz_idx[4] = {4,5,6,3};
        unsigned int tmp_linesize   = 2 *   linesize;
        unsigned int tmp_uvlinesize = 2 * uvlinesize;
        int mbn_xy = mb_xy - 2 * s->mb_stride;      /* top field MB of the pair above */
        int qp;
        int i, j;
        int16_t bS[4];

        for(j=0; j<2; j++, mbn_xy += s->mb_stride){
            if( IS_INTRA(mb_type) ||
                IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
                bS[0] = bS[1] = bS[2] = bS[3] = 3;
            } else {
                /* bS 2 if either side has coded coefficients, else 1 */
                const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
                for( i = 0; i < 4; i++ ) {
                    if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
                        mbn_nnz[nnz_idx[i]] != 0 )
                        bS[i] = 2;
                    else
                        bS[i] = 1;
                }
            }
            // Do not use s->qscale as luma quantizer because it has not the same
            // value in IPCM macroblocks.
            qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
            tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
            { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
            filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
            filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
                              ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
            filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
                              ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
        }

        start = 1;
    }

    /* Calculate bS */
    for( edge = start; edge < edges; edge++ ) {
        /* mbn_xy: neighbor macroblock */
        const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
        const int mbn_type = s->current_picture.mb_type[mbn_xy];
        int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
        int16_t bS[4];
        int qp;

        /* with 8x8 transform only every second edge exists */
        if( (edge&1) && IS_8x8DCT(mb_type) )
            continue;

        if( IS_INTRA(mb_type) ||
            IS_INTRA(mbn_type) ) {
            /* intra on either side: bS 4 on a vertical MB boundary (or any
             * boundary in progressive frames), 3 otherwise */
            int value;
            if (edge == 0) {
                if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
                    || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
                ) {
                    value = 4;
                } else {
                    value = 3;
                }
            } else {
                value = 3;
            }
            bS[0] = bS[1] = bS[2] = bS[3] = value;
        } else {
            int i, l;
            int mv_done;    /* set when mv-based bS is already decided for all 4 segments */

            if( edge & mask_edge ) {
                /* partition shape guarantees identical mv/ref on both sides */
                bS[0] = bS[1] = bS[2] = bS[3] = 0;
                mv_done = 1;
            }
            else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
                /* mixed field/frame pair: bS is fixed at 1 */
                bS[0] = bS[1] = bS[2] = bS[3] = 1;
                mv_done = 1;
            }
            else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
                /* one partition per side: compare a single mv/ref pair */
                int b_idx= 8 + 4 + edge * (dir ? 8:1);
                int bn_idx= b_idx - (dir ? 8:1);
                int v = 0;

                for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
                    v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
                         FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
                         FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
                }

                if(h->slice_type_nos == FF_B_TYPE && v){
                    /* B slices: also allow a cross-list match (L0 vs L1) */
                    v=0;
                    for( l = 0; !v && l < 2; l++ ) {
                        int ln= 1-l;
                        v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
                    }
                }

                bS[0] = bS[1] = bS[2] = bS[3] = v;
                mv_done = 1;
            }
            else
                mv_done = 0;

            for( i = 0; i < 4; i++ ) {
                int x = dir == 0 ? edge : i;
                int y = dir == 0 ? i    : edge;
                int b_idx= 8 + 4 + x + 8*y;
                int bn_idx= b_idx - (dir ? 8:1);

                if( h->non_zero_count_cache[b_idx] |
                    h->non_zero_count_cache[bn_idx] ) {
                    /* coded coefficients on either side */
                    bS[i] = 2;
                }
                else if(!mv_done)
                {
                    /* per-segment mv/ref comparison */
                    bS[i] = 0;
                    for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
                        if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
                            FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
                            FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
                            bS[i] = 1;
                            break;
                        }
                    }

                    if(h->slice_type_nos == FF_B_TYPE && bS[i]){
                        /* B slices: also allow a cross-list match */
                        bS[i] = 0;
                        for( l = 0; l < 2; l++ ) {
                            int ln= 1-l;
                            if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
                                FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
                                FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
                                bS[i] = 1;
                                break;
                            }
                        }
                    }
                }
            }

            if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
                continue;       /* whole edge is a no-op */
        }

        /* Filter edge */
        // Do not use s->qscale as luma quantizer because it has not the same
        // value in IPCM macroblocks.
        qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
        //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
        tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
        { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        if( dir == 0 ) {
            filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
            if( (edge&1) == 0 ) {
                filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
                                  ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
                                  ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
            }
        } else {
            filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
            if( (edge&1) == 0 ) {
                filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
                                  ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
                                  ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
            }
        }
    }
}
6542 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6543 MpegEncContext * const s = &h->s;
6544 const int mb_xy= mb_x + mb_y*s->mb_stride;
6545 const int mb_type = s->current_picture.mb_type[mb_xy];
6546 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6547 int first_vertical_edge_done = 0;
6548 av_unused int dir;
6550 //for sufficiently low qp, filtering wouldn't do anything
6551 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6552 if(!FRAME_MBAFF){
6553 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6554 int qp = s->current_picture.qscale_table[mb_xy];
6555 if(qp <= qp_thresh
6556 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6557 && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6558 return;
6562 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6563 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6564 int top_type, left_type[2];
6565 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6566 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6567 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6569 if(IS_8x8DCT(top_type)){
6570 h->non_zero_count_cache[4+8*0]=
6571 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6572 h->non_zero_count_cache[6+8*0]=
6573 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6575 if(IS_8x8DCT(left_type[0])){
6576 h->non_zero_count_cache[3+8*1]=
6577 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6579 if(IS_8x8DCT(left_type[1])){
6580 h->non_zero_count_cache[3+8*3]=
6581 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6584 if(IS_8x8DCT(mb_type)){
6585 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6586 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
6588 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6589 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6591 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6592 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6594 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6595 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6599 if (FRAME_MBAFF
6600 // left mb is in picture
6601 && h->slice_table[mb_xy-1] != 0xFFFF
6602 // and current and left pair do not have the same interlaced type
6603 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6604 // and left mb is in the same slice if deblocking_filter == 2
6605 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6606 /* First vertical edge is different in MBAFF frames
6607 * There are 8 different bS to compute and 2 different Qp
6609 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6610 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6611 int16_t bS[8];
6612 int qp[2];
6613 int bqp[2];
6614 int rqp[2];
6615 int mb_qp, mbn0_qp, mbn1_qp;
6616 int i;
6617 first_vertical_edge_done = 1;
6619 if( IS_INTRA(mb_type) )
6620 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6621 else {
6622 for( i = 0; i < 8; i++ ) {
6623 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6625 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6626 bS[i] = 4;
6627 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6628 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6629 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6631 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6632 bS[i] = 2;
6633 else
6634 bS[i] = 1;
6638 mb_qp = s->current_picture.qscale_table[mb_xy];
6639 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6640 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6641 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6642 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6643 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6644 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6645 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6646 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6647 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6648 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6649 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6650 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6652 /* Filter edge */
6653 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6654 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6655 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6656 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6657 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6660 #if CONFIG_SMALL
6661 for( dir = 0; dir < 2; dir++ )
6662 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6663 #else
6664 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6665 filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6666 #endif
6669 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6670 H264Context *h = *(void**)arg;
6671 MpegEncContext * const s = &h->s;
6672 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6674 s->mb_skip_run= -1;
6676 h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6677 (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6679 if( h->pps.cabac ) {
6680 int i;
6682 /* realign */
6683 align_get_bits( &s->gb );
6685 /* init cabac */
6686 ff_init_cabac_states( &h->cabac);
6687 ff_init_cabac_decoder( &h->cabac,
6688 s->gb.buffer + get_bits_count(&s->gb)/8,
6689 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6690 /* calculate pre-state */
6691 for( i= 0; i < 460; i++ ) {
6692 int pre;
6693 if( h->slice_type_nos == FF_I_TYPE )
6694 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6695 else
6696 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6698 if( pre <= 63 )
6699 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6700 else
6701 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6704 for(;;){
6705 //START_TIMER
6706 int ret = decode_mb_cabac(h);
6707 int eos;
6708 //STOP_TIMER("decode_mb_cabac")
6710 if(ret>=0) hl_decode_mb(h);
6712 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6713 s->mb_y++;
6715 ret = decode_mb_cabac(h);
6717 if(ret>=0) hl_decode_mb(h);
6718 s->mb_y--;
6720 eos = get_cabac_terminate( &h->cabac );
6722 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6723 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6724 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6725 return -1;
6728 if( ++s->mb_x >= s->mb_width ) {
6729 s->mb_x = 0;
6730 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6731 ++s->mb_y;
6732 if(FIELD_OR_MBAFF_PICTURE) {
6733 ++s->mb_y;
6737 if( eos || s->mb_y >= s->mb_height ) {
6738 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6739 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6740 return 0;
6744 } else {
6745 for(;;){
6746 int ret = decode_mb_cavlc(h);
6748 if(ret>=0) hl_decode_mb(h);
6750 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6751 s->mb_y++;
6752 ret = decode_mb_cavlc(h);
6754 if(ret>=0) hl_decode_mb(h);
6755 s->mb_y--;
6758 if(ret<0){
6759 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6760 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6762 return -1;
6765 if(++s->mb_x >= s->mb_width){
6766 s->mb_x=0;
6767 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6768 ++s->mb_y;
6769 if(FIELD_OR_MBAFF_PICTURE) {
6770 ++s->mb_y;
6772 if(s->mb_y >= s->mb_height){
6773 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6775 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6776 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6778 return 0;
6779 }else{
6780 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6782 return -1;
6787 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6788 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6789 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6790 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6792 return 0;
6793 }else{
6794 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6796 return -1;
6802 #if 0
6803 for(;s->mb_y < s->mb_height; s->mb_y++){
6804 for(;s->mb_x < s->mb_width; s->mb_x++){
6805 int ret= decode_mb(h);
6807 hl_decode_mb(h);
6809 if(ret<0){
6810 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6811 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6813 return -1;
6816 if(++s->mb_x >= s->mb_width){
6817 s->mb_x=0;
6818 if(++s->mb_y >= s->mb_height){
6819 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6820 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6822 return 0;
6823 }else{
6824 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6826 return -1;
6831 if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6832 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6833 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6835 return 0;
6836 }else{
6837 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6839 return -1;
6843 s->mb_x=0;
6844 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6846 #endif
6847 return -1; //not reached
6850 static int decode_picture_timing(H264Context *h){
6851 MpegEncContext * const s = &h->s;
6852 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6853 h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6854 h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6856 if(h->sps.pic_struct_present_flag){
6857 unsigned int i, num_clock_ts;
6858 h->sei_pic_struct = get_bits(&s->gb, 4);
6859 h->sei_ct_type = 0;
6861 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6862 return -1;
6864 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6866 for (i = 0 ; i < num_clock_ts ; i++){
6867 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6868 unsigned int full_timestamp_flag;
6869 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6870 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6871 skip_bits(&s->gb, 5); /* counting_type */
6872 full_timestamp_flag = get_bits(&s->gb, 1);
6873 skip_bits(&s->gb, 1); /* discontinuity_flag */
6874 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6875 skip_bits(&s->gb, 8); /* n_frames */
6876 if(full_timestamp_flag){
6877 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6878 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6879 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6880 }else{
6881 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6882 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6883 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6884 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6885 if(get_bits(&s->gb, 1)) /* hours_flag */
6886 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6890 if(h->sps.time_offset_length > 0)
6891 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6895 return 0;
6898 static int decode_unregistered_user_data(H264Context *h, int size){
6899 MpegEncContext * const s = &h->s;
6900 uint8_t user_data[16+256];
6901 int e, build, i;
6903 if(size<16)
6904 return -1;
6906 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6907 user_data[i]= get_bits(&s->gb, 8);
6910 user_data[i]= 0;
6911 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6912 if(e==1 && build>=0)
6913 h->x264_build= build;
6915 if(s->avctx->debug & FF_DEBUG_BUGS)
6916 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6918 for(; i<size; i++)
6919 skip_bits(&s->gb, 8);
6921 return 0;
6924 static int decode_recovery_point(H264Context *h){
6925 MpegEncContext * const s = &h->s;
6927 h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6928 skip_bits(&s->gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6930 return 0;
6933 static int decode_buffering_period(H264Context *h){
6934 MpegEncContext * const s = &h->s;
6935 unsigned int sps_id;
6936 int sched_sel_idx;
6937 SPS *sps;
6939 sps_id = get_ue_golomb_31(&s->gb);
6940 if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6941 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6942 return -1;
6944 sps = h->sps_buffers[sps_id];
6946 // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6947 if (sps->nal_hrd_parameters_present_flag) {
6948 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6949 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6950 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6953 if (sps->vcl_hrd_parameters_present_flag) {
6954 for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6955 h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6956 skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6960 h->sei_buffering_period_present = 1;
6961 return 0;
6964 int ff_h264_decode_sei(H264Context *h){
6965 MpegEncContext * const s = &h->s;
6967 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6968 int size, type;
6970 type=0;
6972 type+= show_bits(&s->gb, 8);
6973 }while(get_bits(&s->gb, 8) == 255);
6975 size=0;
6977 size+= show_bits(&s->gb, 8);
6978 }while(get_bits(&s->gb, 8) == 255);
6980 switch(type){
6981 case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6982 if(decode_picture_timing(h) < 0)
6983 return -1;
6984 break;
6985 case SEI_TYPE_USER_DATA_UNREGISTERED:
6986 if(decode_unregistered_user_data(h, size) < 0)
6987 return -1;
6988 break;
6989 case SEI_TYPE_RECOVERY_POINT:
6990 if(decode_recovery_point(h) < 0)
6991 return -1;
6992 break;
6993 case SEI_BUFFERING_PERIOD:
6994 if(decode_buffering_period(h) < 0)
6995 return -1;
6996 break;
6997 default:
6998 skip_bits(&s->gb, 8*size);
7001 //FIXME check bits here
7002 align_get_bits(&s->gb);
7005 return 0;
7008 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7009 MpegEncContext * const s = &h->s;
7010 int cpb_count, i;
7011 cpb_count = get_ue_golomb_31(&s->gb) + 1;
7013 if(cpb_count > 32U){
7014 av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7015 return -1;
7018 get_bits(&s->gb, 4); /* bit_rate_scale */
7019 get_bits(&s->gb, 4); /* cpb_size_scale */
7020 for(i=0; i<cpb_count; i++){
7021 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7022 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7023 get_bits1(&s->gb); /* cbr_flag */
7025 sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7026 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7027 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7028 sps->time_offset_length = get_bits(&s->gb, 5);
7029 sps->cpb_cnt = cpb_count;
7030 return 0;
7033 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7034 MpegEncContext * const s = &h->s;
7035 int aspect_ratio_info_present_flag;
7036 unsigned int aspect_ratio_idc;
7038 aspect_ratio_info_present_flag= get_bits1(&s->gb);
7040 if( aspect_ratio_info_present_flag ) {
7041 aspect_ratio_idc= get_bits(&s->gb, 8);
7042 if( aspect_ratio_idc == EXTENDED_SAR ) {
7043 sps->sar.num= get_bits(&s->gb, 16);
7044 sps->sar.den= get_bits(&s->gb, 16);
7045 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7046 sps->sar= pixel_aspect[aspect_ratio_idc];
7047 }else{
7048 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7049 return -1;
7051 }else{
7052 sps->sar.num=
7053 sps->sar.den= 0;
7055 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7057 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7058 get_bits1(&s->gb); /* overscan_appropriate_flag */
7061 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7062 get_bits(&s->gb, 3); /* video_format */
7063 get_bits1(&s->gb); /* video_full_range_flag */
7064 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7065 get_bits(&s->gb, 8); /* colour_primaries */
7066 get_bits(&s->gb, 8); /* transfer_characteristics */
7067 get_bits(&s->gb, 8); /* matrix_coefficients */
7071 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7072 s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1; /* chroma_sample_location_type_top_field */
7073 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7076 sps->timing_info_present_flag = get_bits1(&s->gb);
7077 if(sps->timing_info_present_flag){
7078 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7079 sps->time_scale = get_bits_long(&s->gb, 32);
7080 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7083 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7084 if(sps->nal_hrd_parameters_present_flag)
7085 if(decode_hrd_parameters(h, sps) < 0)
7086 return -1;
7087 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7088 if(sps->vcl_hrd_parameters_present_flag)
7089 if(decode_hrd_parameters(h, sps) < 0)
7090 return -1;
7091 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7092 get_bits1(&s->gb); /* low_delay_hrd_flag */
7093 sps->pic_struct_present_flag = get_bits1(&s->gb);
7095 sps->bitstream_restriction_flag = get_bits1(&s->gb);
7096 if(sps->bitstream_restriction_flag){
7097 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7098 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7099 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7100 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7101 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7102 sps->num_reorder_frames= get_ue_golomb(&s->gb);
7103 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7105 if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7106 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7107 return -1;
7111 return 0;
7114 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7115 const uint8_t *jvt_list, const uint8_t *fallback_list){
7116 MpegEncContext * const s = &h->s;
7117 int i, last = 8, next = 8;
7118 const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7119 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7120 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7121 else
7122 for(i=0;i<size;i++){
7123 if(next)
7124 next = (last + get_se_golomb(&s->gb)) & 0xff;
7125 if(!i && !next){ /* matrix not written, we use the preset one */
7126 memcpy(factors, jvt_list, size*sizeof(uint8_t));
7127 break;
7129 last = factors[scan[i]] = next ? next : last;
7133 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7134 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7135 MpegEncContext * const s = &h->s;
7136 int fallback_sps = !is_sps && sps->scaling_matrix_present;
7137 const uint8_t *fallback[4] = {
7138 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7139 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7140 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7141 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7143 if(get_bits1(&s->gb)){
7144 sps->scaling_matrix_present |= is_sps;
7145 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7146 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7147 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7148 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7149 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7150 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7151 if(is_sps || pps->transform_8x8_mode){
7152 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7153 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
7158 int ff_h264_decode_seq_parameter_set(H264Context *h){
7159 MpegEncContext * const s = &h->s;
7160 int profile_idc, level_idc;
7161 unsigned int sps_id;
7162 int i;
7163 SPS *sps;
7165 profile_idc= get_bits(&s->gb, 8);
7166 get_bits1(&s->gb); //constraint_set0_flag
7167 get_bits1(&s->gb); //constraint_set1_flag
7168 get_bits1(&s->gb); //constraint_set2_flag
7169 get_bits1(&s->gb); //constraint_set3_flag
7170 get_bits(&s->gb, 4); // reserved
7171 level_idc= get_bits(&s->gb, 8);
7172 sps_id= get_ue_golomb_31(&s->gb);
7174 if(sps_id >= MAX_SPS_COUNT) {
7175 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7176 return -1;
7178 sps= av_mallocz(sizeof(SPS));
7179 if(sps == NULL)
7180 return -1;
7182 sps->profile_idc= profile_idc;
7183 sps->level_idc= level_idc;
7185 memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7186 memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7187 sps->scaling_matrix_present = 0;
7189 if(sps->profile_idc >= 100){ //high profile
7190 sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7191 if(sps->chroma_format_idc == 3)
7192 sps->residual_color_transform_flag = get_bits1(&s->gb);
7193 sps->bit_depth_luma = get_ue_golomb(&s->gb) + 8;
7194 sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7195 sps->transform_bypass = get_bits1(&s->gb);
7196 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7197 }else{
7198 sps->chroma_format_idc= 1;
7201 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7202 sps->poc_type= get_ue_golomb_31(&s->gb);
7204 if(sps->poc_type == 0){ //FIXME #define
7205 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7206 } else if(sps->poc_type == 1){//FIXME #define
7207 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7208 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7209 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7210 sps->poc_cycle_length = get_ue_golomb(&s->gb);
7212 if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7213 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7214 goto fail;
7217 for(i=0; i<sps->poc_cycle_length; i++)
7218 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7219 }else if(sps->poc_type != 2){
7220 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7221 goto fail;
7224 sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7225 if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7226 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7227 goto fail;
7229 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7230 sps->mb_width = get_ue_golomb(&s->gb) + 1;
7231 sps->mb_height= get_ue_golomb(&s->gb) + 1;
7232 if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7233 avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7234 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7235 goto fail;
7238 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7239 if(!sps->frame_mbs_only_flag)
7240 sps->mb_aff= get_bits1(&s->gb);
7241 else
7242 sps->mb_aff= 0;
7244 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7246 #ifndef ALLOW_INTERLACE
7247 if(sps->mb_aff)
7248 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7249 #endif
7250 sps->crop= get_bits1(&s->gb);
7251 if(sps->crop){
7252 sps->crop_left = get_ue_golomb(&s->gb);
7253 sps->crop_right = get_ue_golomb(&s->gb);
7254 sps->crop_top = get_ue_golomb(&s->gb);
7255 sps->crop_bottom= get_ue_golomb(&s->gb);
7256 if(sps->crop_left || sps->crop_top){
7257 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7259 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7260 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7262 }else{
7263 sps->crop_left =
7264 sps->crop_right =
7265 sps->crop_top =
7266 sps->crop_bottom= 0;
7269 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7270 if( sps->vui_parameters_present_flag )
7271 decode_vui_parameters(h, sps);
7273 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7274 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7275 sps_id, sps->profile_idc, sps->level_idc,
7276 sps->poc_type,
7277 sps->ref_frame_count,
7278 sps->mb_width, sps->mb_height,
7279 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7280 sps->direct_8x8_inference_flag ? "8B8" : "",
7281 sps->crop_left, sps->crop_right,
7282 sps->crop_top, sps->crop_bottom,
7283 sps->vui_parameters_present_flag ? "VUI" : "",
7284 ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7285 sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7286 sps->timing_info_present_flag ? sps->time_scale : 0
7290 av_free(h->sps_buffers[sps_id]);
7291 h->sps_buffers[sps_id]= sps;
7292 h->sps = *sps;
7293 return 0;
7294 fail:
7295 av_free(sps);
7296 return -1;
7299 static void
7300 build_qp_table(PPS *pps, int t, int index)
7302 int i;
7303 for(i = 0; i < 52; i++)
7304 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7307 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7308 MpegEncContext * const s = &h->s;
7309 unsigned int pps_id= get_ue_golomb(&s->gb);
7310 PPS *pps;
7312 if(pps_id >= MAX_PPS_COUNT) {
7313 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7314 return -1;
7317 pps= av_mallocz(sizeof(PPS));
7318 if(pps == NULL)
7319 return -1;
7320 pps->sps_id= get_ue_golomb_31(&s->gb);
7321 if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7322 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7323 goto fail;
7326 pps->cabac= get_bits1(&s->gb);
7327 pps->pic_order_present= get_bits1(&s->gb);
7328 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7329 if(pps->slice_group_count > 1 ){
7330 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7331 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7332 switch(pps->mb_slice_group_map_type){
7333 case 0:
7334 #if 0
7335 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7336 | run_length[ i ] |1 |ue(v) |
7337 #endif
7338 break;
7339 case 2:
7340 #if 0
7341 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7342 |{ | | |
7343 | top_left_mb[ i ] |1 |ue(v) |
7344 | bottom_right_mb[ i ] |1 |ue(v) |
7345 | } | | |
7346 #endif
7347 break;
7348 case 3:
7349 case 4:
7350 case 5:
7351 #if 0
7352 | slice_group_change_direction_flag |1 |u(1) |
7353 | slice_group_change_rate_minus1 |1 |ue(v) |
7354 #endif
7355 break;
7356 case 6:
7357 #if 0
7358 | slice_group_id_cnt_minus1 |1 |ue(v) |
7359 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7360 |) | | |
7361 | slice_group_id[ i ] |1 |u(v) |
7362 #endif
7363 break;
7366 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7367 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7368 if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7369 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7370 goto fail;
7373 pps->weighted_pred= get_bits1(&s->gb);
7374 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7375 pps->init_qp= get_se_golomb(&s->gb) + 26;
7376 pps->init_qs= get_se_golomb(&s->gb) + 26;
7377 pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7378 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7379 pps->constrained_intra_pred= get_bits1(&s->gb);
7380 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7382 pps->transform_8x8_mode= 0;
7383 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7384 memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7385 memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7387 if(get_bits_count(&s->gb) < bit_length){
7388 pps->transform_8x8_mode= get_bits1(&s->gb);
7389 decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7390 pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7391 } else {
7392 pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7395 build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7396 build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7397 if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7398 h->pps.chroma_qp_diff= 1;
7400 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7401 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7402 pps_id, pps->sps_id,
7403 pps->cabac ? "CABAC" : "CAVLC",
7404 pps->slice_group_count,
7405 pps->ref_count[0], pps->ref_count[1],
7406 pps->weighted_pred ? "weighted" : "",
7407 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7408 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7409 pps->constrained_intra_pred ? "CONSTR" : "",
7410 pps->redundant_pic_cnt_present ? "REDU" : "",
7411 pps->transform_8x8_mode ? "8x8DCT" : ""
7415 av_free(h->pps_buffers[pps_id]);
7416 h->pps_buffers[pps_id]= pps;
7417 return 0;
7418 fail:
7419 av_free(pps);
7420 return -1;
7424 * Call decode_slice() for each context.
7426 * @param h h264 master context
7427 * @param context_count number of contexts to execute
7429 static void execute_decode_slices(H264Context *h, int context_count){
7430 MpegEncContext * const s = &h->s;
7431 AVCodecContext * const avctx= s->avctx;
7432 H264Context *hx;
7433 int i;
7435 if (s->avctx->hwaccel)
7436 return;
7437 if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7438 return;
7439 if(context_count == 1) {
7440 decode_slice(avctx, &h);
7441 } else {
7442 for(i = 1; i < context_count; i++) {
7443 hx = h->thread_context[i];
7444 hx->s.error_recognition = avctx->error_recognition;
7445 hx->s.error_count = 0;
7448 avctx->execute(avctx, (void *)decode_slice,
7449 (void **)h->thread_context, NULL, context_count, sizeof(void*));
7451 /* pull back stuff from slices to master context */
7452 hx = h->thread_context[context_count - 1];
7453 s->mb_x = hx->s.mb_x;
7454 s->mb_y = hx->s.mb_y;
7455 s->dropable = hx->s.dropable;
7456 s->picture_structure = hx->s.picture_structure;
7457 for(i = 1; i < context_count; i++)
7458 h->s.error_count += h->thread_context[i]->s.error_count;
/**
 * Decode all NAL units contained in buf.
 *
 * Handles both AVC input (big-endian length-prefixed NALs, h->is_avc) and
 * Annex B input (00 00 01 start-code separated).  Each NAL is unescaped via
 * ff_h264_decode_nal() and dispatched on nal_unit_type; slices are queued
 * per thread context and flushed with execute_decode_slices() once
 * h->max_contexts are pending.
 *
 * @return number of input bytes consumed, or -1 on error
 *
 * NOTE(review): this chunk of the file is a scrape with original line
 * numbers baked into each line and some closing-brace lines missing; the
 * code below is deliberately left byte-identical.
 */
7463 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7464 MpegEncContext * const s = &h->s;
7465 AVCodecContext * const avctx= s->avctx;
7466 int buf_index=0;
7467 H264Context *hx; ///< thread context
7468 int context_count = 0;
7469 int next_avc= h->is_avc ? 0 : buf_size;
7471 h->max_contexts = avctx->thread_count;
7472 #if 0
7473 int i;
7474 for(i=0; i<50; i++){
7475 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7477 #endif
// Unless decoding in chunks, each call starts a fresh access unit.
7478 if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7479 h->current_slice = 0;
7480 if (!s->first_field)
7481 s->current_picture_ptr= NULL;
7482 reset_sei(h);
7485 for(;;){
7486 int consumed;
7487 int dst_length;
7488 int bit_length;
7489 const uint8_t *ptr;
7490 int i, nalsize = 0;
7491 int err;
// AVC mode: read the big-endian NAL size prefix (h->nal_length_size bytes).
7493 if(buf_index >= next_avc) {
7494 if(buf_index >= buf_size) break;
7495 nalsize = 0;
7496 for(i = 0; i < h->nal_length_size; i++)
7497 nalsize = (nalsize << 8) | buf[buf_index++];
7498 if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7499 if(nalsize == 1){
7500 buf_index++;
7501 continue;
7502 }else{
7503 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7504 break;
7507 next_avc= buf_index + nalsize;
7508 } else {
7509 // start code prefix search
7510 for(; buf_index + 3 < buf_size; buf_index++){
7511 // This should always succeed in the first iteration.
7512 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7513 break;
7516 if(buf_index+3 >= buf_size) break;
7518 buf_index+=3;
7521 hx = h->thread_context[context_count];
// Unescape the RBSP (strip emulation-prevention bytes) into hx's rbsp buffer.
7523 ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7524 if (ptr==NULL || dst_length < 0){
7525 return -1;
// Strip trailing zero bytes before locating the rbsp_stop_one_bit.
7527 while(ptr[dst_length - 1] == 0 && dst_length > 0)
7528 dst_length--;
7529 bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7531 if(s->avctx->debug&FF_DEBUG_STARTCODE){
7532 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
// Trailing padding after an AVC NAL is only an error if it contains non-zero bytes.
7535 if (h->is_avc && (nalsize != consumed) && nalsize){
7536 int i, debug_level = AV_LOG_DEBUG;
7537 for (i = consumed; i < nalsize; i++)
7538 if (buf[buf_index+i])
7539 debug_level = AV_LOG_ERROR;
7540 av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7543 buf_index += consumed;
7545 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
7546 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7547 continue;
// Re-entry point when falling back from parallel to serial slice decoding.
7549 again:
7550 err = 0;
7551 switch(hx->nal_unit_type){
7552 case NAL_IDR_SLICE:
7553 if (h->nal_unit_type != NAL_IDR_SLICE) {
7554 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7555 return -1;
7557 idr(h); //FIXME ensure we don't loose some frames if there is reordering
// fallthrough: an IDR slice is then parsed like a regular slice
7558 case NAL_SLICE:
7559 init_get_bits(&hx->s.gb, ptr, bit_length);
7560 hx->intra_gb_ptr=
7561 hx->inter_gb_ptr= &hx->s.gb;
7562 hx->s.data_partitioning = 0;
7564 if((err = decode_slice_header(hx, h)))
7565 break;
7567 if (s->avctx->hwaccel && h->current_slice == 1) {
7568 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7569 return -1;
7572 s->current_picture_ptr->key_frame |=
7573 (hx->nal_unit_type == NAL_IDR_SLICE) ||
7574 (h->sei_recovery_frame_cnt >= 0);
// Only queue the slice if it passes the redundant/skip/hurry filters.
7575 if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7576 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7577 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7578 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7579 && avctx->skip_frame < AVDISCARD_ALL){
7580 if(avctx->hwaccel) {
7581 if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7582 return -1;
7583 }else
7584 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
// VDPAU gets the raw (still escaped) NAL, re-prefixed with a start code.
7585 static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7586 ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7587 ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7588 }else
7589 context_count++;
7591 break;
// Data partition A carries the slice header of a partitioned slice.
7592 case NAL_DPA:
7593 init_get_bits(&hx->s.gb, ptr, bit_length);
7594 hx->intra_gb_ptr=
7595 hx->inter_gb_ptr= NULL;
7596 hx->s.data_partitioning = 1;
7598 err = decode_slice_header(hx, h);
7599 break;
7600 case NAL_DPB:
7601 init_get_bits(&hx->intra_gb, ptr, bit_length);
7602 hx->intra_gb_ptr= &hx->intra_gb;
7603 break;
7604 case NAL_DPC:
7605 init_get_bits(&hx->inter_gb, ptr, bit_length);
7606 hx->inter_gb_ptr= &hx->inter_gb;
// Partition C completes the slice: queue it if A arrived and filters pass.
7608 if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7609 && s->context_initialized
7610 && s->hurry_up < 5
7611 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7612 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7613 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7614 && avctx->skip_frame < AVDISCARD_ALL)
7615 context_count++;
7616 break;
7617 case NAL_SEI:
7618 init_get_bits(&s->gb, ptr, bit_length);
7619 ff_h264_decode_sei(h);
7620 break;
7621 case NAL_SPS:
7622 init_get_bits(&s->gb, ptr, bit_length);
7623 ff_h264_decode_seq_parameter_set(h);
7625 if(s->flags& CODEC_FLAG_LOW_DELAY)
7626 s->low_delay=1;
7628 if(avctx->has_b_frames < 2)
7629 avctx->has_b_frames= !s->low_delay;
7630 break;
7631 case NAL_PPS:
7632 init_get_bits(&s->gb, ptr, bit_length);
7634 ff_h264_decode_picture_parameter_set(h, bit_length);
7636 break;
7637 case NAL_AUD:
7638 case NAL_END_SEQUENCE:
7639 case NAL_END_STREAM:
7640 case NAL_FILLER_DATA:
7641 case NAL_SPS_EXT:
7642 case NAL_AUXILIARY_SLICE:
7643 break;
7644 default:
7645 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
// Flush queued slices once every thread context holds one.
7648 if(context_count == h->max_contexts) {
7649 execute_decode_slices(h, context_count);
7650 context_count = 0;
7653 if (err < 0)
7654 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7655 else if(err == 1) {
7656 /* Slice could not be decoded in parallel mode, copy down
7657 * NAL unit stuff to context 0 and restart. Note that
7658 * rbsp_buffer is not transferred, but since we no longer
7659 * run in parallel mode this should not be an issue. */
7660 h->nal_unit_type = hx->nal_unit_type;
7661 h->nal_ref_idc = hx->nal_ref_idc;
7662 hx = h;
7663 goto again;
// Flush whatever slices remain queued at end of input.
7666 if(context_count)
7667 execute_decode_slices(h, context_count);
7668 return buf_index;
7672 * returns the number of bytes consumed for building the current frame
7674 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7675 if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7676 if(pos+10>buf_size) pos=buf_size; // oops ;)
7678 return pos;
/**
 * AVCodec.decode callback: decode one packet and, once reordering allows,
 * output one frame into *pict (with *data_size = sizeof(AVFrame)).
 *
 * Handles end-of-stream draining (buf_size == 0), one-time avcC/Annex B
 * extradata parsing, interlacing/top-field-first signalling and B-frame
 * display reordering through h->delayed_pic[].
 *
 * @return number of bytes consumed, or -1 on error
 *
 * NOTE(review): this chunk of the file is a scrape with original line
 * numbers baked into each line and some closing-brace lines missing; the
 * code below is deliberately left byte-identical.
 */
7681 static int decode_frame(AVCodecContext *avctx,
7682 void *data, int *data_size,
7683 AVPacket *avpkt)
7685 const uint8_t *buf = avpkt->data;
7686 int buf_size = avpkt->size;
7687 H264Context *h = avctx->priv_data;
7688 MpegEncContext *s = &h->s;
7689 AVFrame *pict = data;
7690 int buf_index;
7692 s->flags= avctx->flags;
7693 s->flags2= avctx->flags2;
7695 /* end of stream, output what is still in the buffers */
7696 if (buf_size == 0) {
7697 Picture *out;
7698 int i, out_idx;
7700 //FIXME factorize this with the output code below
// Pick the delayed picture with the smallest poc (next in display order).
7701 out = h->delayed_pic[0];
7702 out_idx = 0;
7703 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7704 if(h->delayed_pic[i]->poc < out->poc){
7705 out = h->delayed_pic[i];
7706 out_idx = i;
7709 for(i=out_idx; h->delayed_pic[i]; i++)
7710 h->delayed_pic[i] = h->delayed_pic[i+1];
7712 if(out){
7713 *data_size = sizeof(AVFrame);
7714 *pict= *(AVFrame*)out;
7717 return 0;
// First packet of AVC input: parse the avcC extradata (SPS/PPS sets).
7720 if(h->is_avc && !h->got_avcC) {
7721 int i, cnt, nalsize;
7722 unsigned char *p = avctx->extradata;
7723 if(avctx->extradata_size < 7) {
7724 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7725 return -1;
7727 if(*p != 1) {
7728 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7729 return -1;
7731 /* sps and pps in the avcC always have length coded with 2 bytes,
7732 so put a fake nal_length_size = 2 while parsing them */
7733 h->nal_length_size = 2;
7734 // Decode sps from avcC
7735 cnt = *(p+5) & 0x1f; // Number of sps
7736 p += 6;
7737 for (i = 0; i < cnt; i++) {
7738 nalsize = AV_RB16(p) + 2;
7739 if(decode_nal_units(h, p, nalsize) < 0) {
7740 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7741 return -1;
7743 p += nalsize;
7745 // Decode pps from avcC
7746 cnt = *(p++); // Number of pps
7747 for (i = 0; i < cnt; i++) {
7748 nalsize = AV_RB16(p) + 2;
7749 if(decode_nal_units(h, p, nalsize) != nalsize) {
7750 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7751 return -1;
7753 p += nalsize;
7755 // Now store right nal length size, that will be use to parse all other nals
7756 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7757 // Do not reparse avcC
7758 h->got_avcC = 1;
// Annex B style extradata is fed through the NAL parser exactly once.
7761 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7762 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7763 return -1;
7764 h->got_avcC = 1;
7767 buf_index=decode_nal_units(h, buf, buf_size);
7768 if(buf_index < 0)
7769 return -1;
7771 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7772 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7773 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7774 return -1;
// A complete picture was decoded (or the chunked decode reached frame end).
7777 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7778 Picture *out = s->current_picture_ptr;
7779 Picture *cur = s->current_picture_ptr;
7780 int i, pics, cross_idr, out_of_order, out_idx;
7782 field_end(h);
7784 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7785 /* Wait for second field. */
7786 *data_size = 0;
7788 } else {
7789 cur->repeat_pict = 0;
7791 /* Signal interlacing information externally. */
7792 /* Prioritize picture timing SEI information over used decoding process if it exists. */
7793 if (h->sei_ct_type)
7794 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7795 else
7796 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7798 if(h->sps.pic_struct_present_flag){
7799 switch (h->sei_pic_struct)
7801 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7802 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7803 // Signal the possibility of telecined film externally (pic_struct 5,6)
7804 // From these hints, let the applications decide if they apply deinterlacing.
7805 cur->repeat_pict = 1;
7806 break;
7807 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7808 // Force progressive here, as doubling interlaced frame is a bad idea.
7809 cur->interlaced_frame = 0;
7810 cur->repeat_pict = 2;
7811 break;
7812 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7813 cur->interlaced_frame = 0;
7814 cur->repeat_pict = 4;
7815 break;
7817 }else{
7818 /* Derive interlacing flag from used decoding process. */
7819 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7822 if (cur->field_poc[0] != cur->field_poc[1]){
7823 /* Derive top_field_first from field pocs. */
7824 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7825 }else{
7826 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7827 /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7828 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7829 || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7830 cur->top_field_first = 1;
7831 else
7832 cur->top_field_first = 0;
7833 }else{
7834 /* Most likely progressive */
7835 cur->top_field_first = 0;
7839 //FIXME do something with unavailable reference frames
7841 /* Sort B-frames into display order */
7843 if(h->sps.bitstream_restriction_flag
7844 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7845 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7846 s->low_delay = 0;
7849 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7850 && !h->sps.bitstream_restriction_flag){
7851 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7852 s->low_delay= 0;
// Append the new picture to the delayed list, keeping it referenced.
7855 pics = 0;
7856 while(h->delayed_pic[pics]) pics++;
7858 assert(pics <= MAX_DELAYED_PIC_COUNT);
7860 h->delayed_pic[pics++] = cur;
7861 if(cur->reference == 0)
7862 cur->reference = DELAYED_PIC_REF;
7864 out = h->delayed_pic[0];
7865 out_idx = 0;
7866 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7867 if(h->delayed_pic[i]->poc < out->poc){
7868 out = h->delayed_pic[i];
7869 out_idx = i;
7871 cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7873 out_of_order = !cross_idr && out->poc < h->outputed_poc;
7875 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7877 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7878 || (s->low_delay &&
7879 ((!cross_idr && out->poc > h->outputed_poc + 2)
7880 || cur->pict_type == FF_B_TYPE)))
7882 s->low_delay = 0;
7883 s->avctx->has_b_frames++;
// Emit (or drop, if out of order) the selected picture.
7886 if(out_of_order || pics > s->avctx->has_b_frames){
7887 out->reference &= ~DELAYED_PIC_REF;
7888 for(i=out_idx; h->delayed_pic[i]; i++)
7889 h->delayed_pic[i] = h->delayed_pic[i+1];
7891 if(!out_of_order && pics > s->avctx->has_b_frames){
7892 *data_size = sizeof(AVFrame);
7894 h->outputed_poc = out->poc;
7895 *pict= *(AVFrame*)out;
7896 }else{
7897 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7902 assert(pict->data[0] || !*data_size);
7903 ff_print_debug_info(s, pict);
7904 //printf("out %d\n", (int)pict->data[0]);
7905 #if 0 //?
7907 /* Return the Picture timestamp as the frame number */
7908 /* we subtract 1 because it is added on utils.c */
7909 avctx->frame_number = s->picture_number - 1;
7910 #endif
7911 return get_consumed_bytes(s, buf_index, buf_size);
7913 #if 0
/* Dead code (compiled out): fills h->mb_avail[] with neighbour-availability
 * flags for the current macroblock by comparing slice_table entries against
 * the current slice number. Kept byte-identical. */
7914 static inline void fill_mb_avail(H264Context *h){
7915 MpegEncContext * const s = &h->s;
7916 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7918 if(s->mb_y){
7919 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7920 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7921 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7922 }else{
7923 h->mb_avail[0]=
7924 h->mb_avail[1]=
7925 h->mb_avail[2]= 0;
7927 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7928 h->mb_avail[4]= 1; //FIXME move out
7929 h->mb_avail[5]= 0; //FIXME move out
7931 #endif
7933 #ifdef TEST
7934 #undef printf
7935 #undef random
7936 #define COUNT 8000
7937 #define SIZE (COUNT*40)
/**
 * Stand-alone self-test (built only with -DTEST): round-trips and times the
 * unsigned/signed exp-Golomb coders, plus disabled (#if 0) sections for the
 * 4x4 (I)DCT, the quantizer and the NAL escape/unescape layer.
 * NOTE(review): scrape with baked-in line numbers and missing brace lines;
 * code left byte-identical.
 */
7938 int main(void){
7939 int i;
7940 uint8_t temp[SIZE];
7941 PutBitContext pb;
7942 GetBitContext gb;
7943 // int int_temp[10000];
7944 DSPContext dsp;
7945 AVCodecContext avctx;
7947 dsputil_init(&dsp, &avctx);
7949 init_put_bits(&pb, temp, SIZE);
7950 printf("testing unsigned exp golomb\n");
7951 for(i=0; i<COUNT; i++){
7952 START_TIMER
7953 set_ue_golomb(&pb, i);
7954 STOP_TIMER("set_ue_golomb");
7956 flush_put_bits(&pb);
// Read back every value and check it decodes to what was written.
7958 init_get_bits(&gb, temp, 8*SIZE);
7959 for(i=0; i<COUNT; i++){
7960 int j, s;
7962 s= show_bits(&gb, 24);
7964 START_TIMER
7965 j= get_ue_golomb(&gb);
7966 if(j != i){
7967 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7968 // return -1;
7970 STOP_TIMER("get_ue_golomb");
7974 init_put_bits(&pb, temp, SIZE);
7975 printf("testing signed exp golomb\n");
7976 for(i=0; i<COUNT; i++){
7977 START_TIMER
7978 set_se_golomb(&pb, i - COUNT/2);
7979 STOP_TIMER("set_se_golomb");
7981 flush_put_bits(&pb);
7983 init_get_bits(&gb, temp, 8*SIZE);
7984 for(i=0; i<COUNT; i++){
7985 int j, s;
7987 s= show_bits(&gb, 24);
7989 START_TIMER
7990 j= get_se_golomb(&gb);
7991 if(j != i - COUNT/2){
7992 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7993 // return -1;
7995 STOP_TIMER("get_se_golomb");
7998 #if 0
7999 printf("testing 4x4 (I)DCT\n");
8001 DCTELEM block[16];
8002 uint8_t src[16], ref[16];
8003 uint64_t error= 0, max_error=0;
8005 for(i=0; i<COUNT; i++){
8006 int j;
8007 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8008 for(j=0; j<16; j++){
8009 ref[j]= random()%255;
8010 src[j]= random()%255;
8013 h264_diff_dct_c(block, src, ref, 4);
8015 //normalize
8016 for(j=0; j<16; j++){
8017 // printf("%d ", block[j]);
8018 block[j]= block[j]*4;
8019 if(j&1) block[j]= (block[j]*4 + 2)/5;
8020 if(j&4) block[j]= (block[j]*4 + 2)/5;
8022 // printf("\n");
8024 s->dsp.h264_idct_add(ref, block, 4);
8025 /* for(j=0; j<16; j++){
8026 printf("%d ", ref[j]);
8028 printf("\n");*/
8030 for(j=0; j<16; j++){
8031 int diff= FFABS(src[j] - ref[j]);
8033 error+= diff*diff;
8034 max_error= FFMAX(max_error, diff);
8037 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8038 printf("testing quantizer\n");
8039 for(qp=0; qp<52; qp++){
8040 for(i=0; i<16; i++)
8041 src1_block[i]= src2_block[i]= random()%255;
8044 printf("Testing NAL layer\n");
8046 uint8_t bitstream[COUNT];
8047 uint8_t nal[COUNT*2];
8048 H264Context h;
8049 memset(&h, 0, sizeof(H264Context));
8051 for(i=0; i<COUNT; i++){
8052 int zeros= i;
8053 int nal_length;
8054 int consumed;
8055 int out_length;
8056 uint8_t *out;
8057 int j;
// Build a random bitstream with exactly `zeros` zero bytes scattered in it.
8059 for(j=0; j<COUNT; j++){
8060 bitstream[j]= (random() % 255) + 1;
8063 for(j=0; j<zeros; j++){
8064 int pos= random() % COUNT;
8065 while(bitstream[pos] == 0){
8066 pos++;
8067 pos %= COUNT;
8069 bitstream[pos]=0;
8072 START_TIMER
// Escape then unescape; the result must match the original bitstream.
8074 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8075 if(nal_length<0){
8076 printf("encoding failed\n");
8077 return -1;
8080 out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8082 STOP_TIMER("NAL")
8084 if(out_length != COUNT){
8085 printf("incorrect length %d %d\n", out_length, COUNT);
8086 return -1;
8089 if(consumed != nal_length){
8090 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8091 return -1;
8094 if(memcmp(bitstream, out, COUNT)){
8095 printf("mismatch\n");
8096 return -1;
8099 #endif
8101 printf("Testing RBSP\n");
8104 return 0;
8106 #endif /* TEST */
8109 av_cold void ff_h264_free_context(H264Context *h)
8111 int i;
8113 av_freep(&h->rbsp_buffer[0]);
8114 av_freep(&h->rbsp_buffer[1]);
8115 free_tables(h); //FIXME cleanup init stuff perhaps
8117 for(i = 0; i < MAX_SPS_COUNT; i++)
8118 av_freep(h->sps_buffers + i);
8120 for(i = 0; i < MAX_PPS_COUNT; i++)
8121 av_freep(h->pps_buffers + i);
8124 static av_cold int decode_end(AVCodecContext *avctx)
8126 H264Context *h = avctx->priv_data;
8127 MpegEncContext *s = &h->s;
8129 ff_h264_free_context(h);
8131 MPV_common_end(s);
8133 // memset(h, 0, sizeof(H264Context));
8135 return 0;
/* Public descriptor for the software H.264 decoder.  Positional fields
 * follow the AVCodec layout of this FFmpeg revision (name, type, id,
 * priv_data_size, init, encode, close, decode, capabilities) —
 * NOTE(review): field order presumed from the callback names; verify
 * against avcodec.h. Kept byte-identical. */
8139 AVCodec h264_decoder = {
8140 "h264",
8141 CODEC_TYPE_VIDEO,
8142 CODEC_ID_H264,
8143 sizeof(H264Context),
8144 decode_init,
8145 NULL,
8146 decode_end,
8147 decode_frame,
8148 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8149 .flush= flush_dpb,
8150 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8151 .pix_fmts= ff_hwaccel_pixfmt_list_420,
8154 #if CONFIG_H264_VDPAU_DECODER
/* Descriptor for the VDPAU-accelerated H.264 decoder variant; shares the
 * init/close/decode callbacks with the software decoder and additionally
 * advertises CODEC_CAP_HWACCEL_VDPAU. Kept byte-identical. */
8155 AVCodec h264_vdpau_decoder = {
8156 "h264_vdpau",
8157 CODEC_TYPE_VIDEO,
8158 CODEC_ID_H264,
8159 sizeof(H264Context),
8160 decode_init,
8161 NULL,
8162 decode_end,
8163 decode_frame,
8164 CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8165 .flush= flush_dpb,
8166 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8168 #endif
8170 #if CONFIG_SVQ3_DECODER
8171 #include "svq3.c"
8172 #endif