Use golomb_to_pict_type instead of its duplicate, slice_type_map.
[FFMpeg-mirror/ffmpeg-vdpau.git] / libavcodec / h264.c
blob 6782d7f076abc3523809d854fe9dd13b2e19e60d
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 /**
23 * @file h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
*/
28 #include "dsputil.h"
29 #include "avcodec.h"
30 #include "mpegvideo.h"
31 #include "h264.h"
32 #include "h264data.h"
33 #include "h264_parser.h"
34 #include "golomb.h"
35 #include "rectangle.h"
37 #include "cabac.h"
38 #ifdef ARCH_X86
39 #include "i386/h264_i386.h"
40 #endif
42 //#undef NDEBUG
43 #include <assert.h>
45 /**
46 * Value of Picture.reference when Picture is not a reference picture, but
47 * is held for delayed output.
*/
49 #define DELAYED_PIC_REF 4
51 static VLC coeff_token_vlc[4];
52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
55 static VLC chroma_dc_coeff_token_vlc;
56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
59 static VLC total_zeros_vlc[15];
60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
61 static const int total_zeros_vlc_tables_size = 512;
63 static VLC chroma_dc_total_zeros_vlc[3];
64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
67 static VLC run_vlc[6];
68 static VLC_TYPE run_vlc_tables[6][8][2];
69 static const int run_vlc_tables_size = 8;
71 static VLC run7_vlc;
72 static VLC_TYPE run7_vlc_table[96][2];
73 static const int run7_vlc_table_size = 96;
75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
81 static av_always_inline uint32_t pack16to32(int a, int b){
82 #ifdef WORDS_BIGENDIAN
83 return (b&0xFFFF) + (a<<16);
84 #else
85 return (a&0xFFFF) + (b<<16);
86 #endif
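/* Illustrative sketch (hypothetical helper, illustration only): packing a
 * motion-vector pair with pack16to32() and writing it through a uint32_t
 * pointer gives the same byte layout as storing the two int16_t components
 * one after the other, which is why the caches below are filled with single
 * 32-bit stores. */
#if 0
static void pack16to32_sketch(void)
{
    int16_t mv[2];
    *(uint32_t*)mv = pack16to32(3, -7);   /* mx=3, my=-7 packed into one word */
    assert(mv[0] == 3 && mv[1] == -7);    /* identical to mv[0]=3; mv[1]=-7;   */
}
#endif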
89 static const uint8_t rem6[52]={
90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
93 static const uint8_t div6[52]={
94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
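/* The two tables above are just precomputed qp%6 and qp/6 for the legal luma
 * QP range 0..51, used by the dequantisation code.  A minimal sketch of the
 * invariant they satisfy (hypothetical check function, assuming qp stays in
 * that range): */
#if 0
static void check_rem6_div6(void)
{
    int qp;
    for(qp=0; qp<52; qp++)
        assert(rem6[qp] == qp % 6 && div6[qp] == qp / 6);
}
#endif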
97 static const int left_block_options[4][8]={
98 {0,1,2,3,7,10,8,11},
99 {2,2,3,3,8,11,8,11},
100 {0,0,1,1,7,10,7,10},
101 {0,2,0,2,7,10,7,10}
104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
105 MpegEncContext * const s = &h->s;
106 const int mb_xy= h->mb_xy;
107 int topleft_xy, top_xy, topright_xy, left_xy[2];
108 int topleft_type, top_type, topright_type, left_type[2];
109 int * left_block;
110 int topleft_partition= -1;
111 int i;
113 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
115 //FIXME deblocking could skip the intra and nnz parts.
116 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
117 return;
119 /* Wow, what a mess, why didn't they simplify the interlacing & intra
120 * stuff, I can't imagine that these complex rules are worth it. */
122 topleft_xy = top_xy - 1;
123 topright_xy= top_xy + 1;
124 left_xy[1] = left_xy[0] = mb_xy-1;
125 left_block = left_block_options[0];
126 if(FRAME_MBAFF){
127 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
128 const int top_pair_xy = pair_xy - s->mb_stride;
129 const int topleft_pair_xy = top_pair_xy - 1;
130 const int topright_pair_xy = top_pair_xy + 1;
131 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
132 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
133 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
134 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
135 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
136 const int bottom = (s->mb_y & 1);
137 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
138 if (bottom
139 ? !curr_mb_frame_flag // bottom macroblock
140 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
142 top_xy -= s->mb_stride;
144 if (bottom
145 ? !curr_mb_frame_flag // bottom macroblock
146 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
148 topleft_xy -= s->mb_stride;
149 } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
150 topleft_xy += s->mb_stride;
151 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
152 topleft_partition = 0;
154 if (bottom
155 ? !curr_mb_frame_flag // bottom macroblock
156 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
158 topright_xy -= s->mb_stride;
160 if (left_mb_frame_flag != curr_mb_frame_flag) {
161 left_xy[1] = left_xy[0] = pair_xy - 1;
162 if (curr_mb_frame_flag) {
163 if (bottom) {
164 left_block = left_block_options[1];
165 } else {
166 left_block= left_block_options[2];
168 } else {
169 left_xy[1] += s->mb_stride;
170 left_block = left_block_options[3];
175 h->top_mb_xy = top_xy;
176 h->left_mb_xy[0] = left_xy[0];
177 h->left_mb_xy[1] = left_xy[1];
178 if(for_deblock){
179 topleft_type = 0;
180 topright_type = 0;
181 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
182 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
183 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
185 if(MB_MBAFF && !IS_INTRA(mb_type)){
186 int list;
187 for(list=0; list<h->list_count; list++){
188 //These values were changed for ease of performing MC, we need to change them back
189 //FIXME maybe we can make MC and loop filter use the same values or prevent
190 //the MC code from changing ref_cache and rather use a temporary array.
191 if(USES_LIST(mb_type,list)){
192 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
193 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
194 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
195 ref += h->b8_stride;
196 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
197 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
201 }else{
202 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
203 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
204 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
205 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
206 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
208 if(IS_INTRA(mb_type)){
209 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
210 h->topleft_samples_available=
211 h->top_samples_available=
212 h->left_samples_available= 0xFFFF;
213 h->topright_samples_available= 0xEEEA;
215 if(!(top_type & type_mask)){
216 h->topleft_samples_available= 0xB3FF;
217 h->top_samples_available= 0x33FF;
218 h->topright_samples_available= 0x26EA;
220 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
221 if(IS_INTERLACED(mb_type)){
222 if(!(left_type[0] & type_mask)){
223 h->topleft_samples_available&= 0xDFFF;
224 h->left_samples_available&= 0x5FFF;
226 if(!(left_type[1] & type_mask)){
227 h->topleft_samples_available&= 0xFF5F;
228 h->left_samples_available&= 0xFF5F;
230 }else{
231 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
232 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
233 assert(left_xy[0] == left_xy[1]);
234 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
239 }else{
240 if(!(left_type[0] & type_mask)){
241 h->topleft_samples_available&= 0xDF5F;
242 h->left_samples_available&= 0x5F5F;
246 if(!(topleft_type & type_mask))
247 h->topleft_samples_available&= 0x7FFF;
249 if(!(topright_type & type_mask))
250 h->topright_samples_available&= 0xFBFF;
252 if(IS_INTRA4x4(mb_type)){
253 if(IS_INTRA4x4(top_type)){
254 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
255 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
256 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
257 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
258 }else{
259 int pred;
260 if(!(top_type & type_mask))
261 pred= -1;
262 else{
263 pred= 2;
265 h->intra4x4_pred_mode_cache[4+8*0]=
266 h->intra4x4_pred_mode_cache[5+8*0]=
267 h->intra4x4_pred_mode_cache[6+8*0]=
268 h->intra4x4_pred_mode_cache[7+8*0]= pred;
270 for(i=0; i<2; i++){
271 if(IS_INTRA4x4(left_type[i])){
272 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
273 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
274 }else{
275 int pred;
276 if(!(left_type[i] & type_mask))
277 pred= -1;
278 else{
279 pred= 2;
281 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
282 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
/*
291 0 . T T. T T T T
292 1 L . .L . . . .
293 2 L . .L . . . .
294 3 . T TL . . . .
295 4 L . .L . . . .
296 5 L . .. . . . .
*/
298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
299 if(top_type){
300 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
301 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
302 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
303 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
305 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
306 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
308 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
309 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
311 }else{
312 h->non_zero_count_cache[4+8*0]=
313 h->non_zero_count_cache[5+8*0]=
314 h->non_zero_count_cache[6+8*0]=
315 h->non_zero_count_cache[7+8*0]=
317 h->non_zero_count_cache[1+8*0]=
318 h->non_zero_count_cache[2+8*0]=
320 h->non_zero_count_cache[1+8*3]=
321 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
325 for (i=0; i<2; i++) {
326 if(left_type[i]){
327 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
328 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
329 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
330 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
331 }else{
332 h->non_zero_count_cache[3+8*1 + 2*8*i]=
333 h->non_zero_count_cache[3+8*2 + 2*8*i]=
334 h->non_zero_count_cache[0+8*1 + 8*i]=
335 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
339 if( h->pps.cabac ) {
340 // top_cbp
341 if(top_type) {
342 h->top_cbp = h->cbp_table[top_xy];
343 } else if(IS_INTRA(mb_type)) {
344 h->top_cbp = 0x1C0;
345 } else {
346 h->top_cbp = 0;
348 // left_cbp
349 if (left_type[0]) {
350 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
351 } else if(IS_INTRA(mb_type)) {
352 h->left_cbp = 0x1C0;
353 } else {
354 h->left_cbp = 0;
356 if (left_type[0]) {
357 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
359 if (left_type[1]) {
360 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
364 #if 1
365 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
366 int list;
367 for(list=0; list<h->list_count; list++){
368 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
369 /*if(!h->mv_cache_clean[list]){
370 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
371 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
372 h->mv_cache_clean[list]= 1;
}*/
374 continue;
376 h->mv_cache_clean[list]= 0;
378 if(USES_LIST(top_type, list)){
379 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
380 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
381 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
382 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
383 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
384 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
385 h->ref_cache[list][scan8[0] + 0 - 1*8]=
386 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
387 h->ref_cache[list][scan8[0] + 2 - 1*8]=
388 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
389 }else{
390 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
391 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
392 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
393 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
394 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
397 for(i=0; i<2; i++){
398 int cache_idx = scan8[0] - 1 + i*2*8;
399 if(USES_LIST(left_type[i], list)){
400 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
401 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
402 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
403 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
404 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
405 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
406 }else{
407 *(uint32_t*)h->mv_cache [list][cache_idx ]=
408 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
409 h->ref_cache[list][cache_idx ]=
410 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
414 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
415 continue;
417 if(USES_LIST(topleft_type, list)){
418 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
419 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
420 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
421 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
422 }else{
423 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
424 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
427 if(USES_LIST(topright_type, list)){
428 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
429 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
430 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
431 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
432 }else{
433 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
434 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
437 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
438 continue;
440 h->ref_cache[list][scan8[5 ]+1] =
441 h->ref_cache[list][scan8[7 ]+1] =
442 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
443 h->ref_cache[list][scan8[4 ]] =
444 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
445 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
446 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
447 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
448 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
449 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
451 if( h->pps.cabac ) {
452 /* XXX yuck, Load mvd */
453 if(USES_LIST(top_type, list)){
454 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
455 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
456 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
457 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
458 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
459 }else{
460 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
461 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
462 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
463 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
465 if(USES_LIST(left_type[0], list)){
466 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
467 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
468 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
469 }else{
470 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
471 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
473 if(USES_LIST(left_type[1], list)){
474 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
475 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
476 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
477 }else{
478 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
479 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
481 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
482 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
483 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
484 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
485 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
487 if(h->slice_type_nos == FF_B_TYPE){
488 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
490 if(IS_DIRECT(top_type)){
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
492 }else if(IS_8X8(top_type)){
493 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
494 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
495 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
496 }else{
497 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
500 if(IS_DIRECT(left_type[0]))
501 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
502 else if(IS_8X8(left_type[0]))
503 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
504 else
505 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
507 if(IS_DIRECT(left_type[1]))
508 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
509 else if(IS_8X8(left_type[1]))
510 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
511 else
512 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
516 if(FRAME_MBAFF){
517 #define MAP_MVS\
518 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
519 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
520 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
521 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
522 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
523 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
524 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
525 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
526 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
527 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
528 if(MB_FIELD){
529 #define MAP_F2F(idx, mb_type)\
530 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
531 h->ref_cache[list][idx] <<= 1;\
532 h->mv_cache[list][idx][1] /= 2;\
533 h->mvd_cache[list][idx][1] /= 2;\
535 MAP_MVS
536 #undef MAP_F2F
537 }else{
538 #define MAP_F2F(idx, mb_type)\
539 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
540 h->ref_cache[list][idx] >>= 1;\
541 h->mv_cache[list][idx][1] <<= 1;\
542 h->mvd_cache[list][idx][1] <<= 1;\
544 MAP_MVS
545 #undef MAP_F2F
550 #endif
552 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
555 static inline void write_back_intra_pred_mode(H264Context *h){
556 const int mb_xy= h->mb_xy;
558 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
559 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
560 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
561 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
562 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
563 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
564 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
568 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
570 static inline int check_intra4x4_pred_mode(H264Context *h){
571 MpegEncContext * const s = &h->s;
572 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
573 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
574 int i;
576 if(!(h->top_samples_available&0x8000)){
577 for(i=0; i<4; i++){
578 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
579 if(status<0){
580 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
581 return -1;
582 } else if(status){
583 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
588 if((h->left_samples_available&0x8888)!=0x8888){
589 static const int mask[4]={0x8000,0x2000,0x80,0x20};
590 for(i=0; i<4; i++){
591 if(!(h->left_samples_available&mask[i])){
592 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
593 if(status<0){
594 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
595 return -1;
596 } else if(status){
597 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
603 return 0;
604 } //FIXME cleanup like next
607 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
609 static inline int check_intra_pred_mode(H264Context *h, int mode){
610 MpegEncContext * const s = &h->s;
611 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
612 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
614 if(mode > 6U) {
615 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
616 return -1;
619 if(!(h->top_samples_available&0x8000)){
620 mode= top[ mode ];
621 if(mode<0){
622 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
623 return -1;
627 if((h->left_samples_available&0x8080) != 0x8080){
628 mode= left[ mode ];
629 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
630 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
632 if(mode<0){
633 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
634 return -1;
638 return mode;
642 * gets the predicted intra4x4 prediction mode.
644 static inline int pred_intra_mode(H264Context *h, int n){
645 const int index8= scan8[n];
646 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
647 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
648 const int min= FFMIN(left, top);
650 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
652 if(min<0) return DC_PRED;
653 else return min;
656 static inline void write_back_non_zero_count(H264Context *h){
657 const int mb_xy= h->mb_xy;
659 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
660 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
661 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
662 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
663 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
664 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
665 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
667 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
668 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
669 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
671 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
672 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
673 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
677 * gets the predicted number of non-zero coefficients.
678 * @param n block index
680 static inline int pred_non_zero_count(H264Context *h, int n){
681 const int index8= scan8[n];
682 const int left= h->non_zero_count_cache[index8 - 1];
683 const int top = h->non_zero_count_cache[index8 - 8];
684 int i= left + top;
686 if(i<64) i= (i+1)>>1;
688 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
690 return i&31;
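/* Worked example of the prediction above: unavailable neighbours are cached
 * as 64, so only the low five bits carry a real count.  left=3, top=5 gives
 * the rounded average (3+5+1)>>1 = 4; left=3 with the top neighbour missing
 * gives (3+64)&31 = 3; with both missing, 128&31 = 0. */
#if 0
static void pred_nnz_sketch(void)     /* hypothetical, illustration only */
{
    assert(((((3+5)+1)>>1) & 31) == 4);
    assert(((3+64) & 31) == 3);
    assert(((64+64) & 31) == 0);
}
#endif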
693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
694 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
695 MpegEncContext *s = &h->s;
697 /* there is no consistent mapping of mvs to neighboring locations that will
698 * make mbaff happy, so we can't move all this logic to fill_caches */
699 if(FRAME_MBAFF){
700 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
701 const int16_t *mv;
702 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
703 *C = h->mv_cache[list][scan8[0]-2];
705 if(!MB_FIELD
706 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
707 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
708 if(IS_INTERLACED(mb_types[topright_xy])){
709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
710 const int x4 = X4, y4 = Y4;\
711 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
712 if(!USES_LIST(mb_type,list))\
713 return LIST_NOT_USED;\
714 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
715 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
716 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
717 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
719 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
722 if(topright_ref == PART_NOT_AVAILABLE
723 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
724 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
725 if(!MB_FIELD
726 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
727 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
729 if(MB_FIELD
730 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
731 && i >= scan8[0]+8){
732 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
733 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
736 #undef SET_DIAG_MV
739 if(topright_ref != PART_NOT_AVAILABLE){
740 *C= h->mv_cache[list][ i - 8 + part_width ];
741 return topright_ref;
742 }else{
743 tprintf(s->avctx, "topright MV not available\n");
745 *C= h->mv_cache[list][ i - 8 - 1 ];
746 return h->ref_cache[list][ i - 8 - 1 ];
751 * gets the predicted MV.
752 * @param n the block index
753 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
754 * @param mx the x component of the predicted motion vector
755 * @param my the y component of the predicted motion vector
757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
758 const int index8= scan8[n];
759 const int top_ref= h->ref_cache[list][ index8 - 8 ];
760 const int left_ref= h->ref_cache[list][ index8 - 1 ];
761 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
762 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
763 const int16_t * C;
764 int diagonal_ref, match_count;
766 assert(part_width==1 || part_width==2 || part_width==4);
768 /* mv_cache
769 B . . A T T T T
770 U . . L . . , .
771 U . . L . . . .
772 U . . L . . , .
773 . . . L . . . .
*/
776 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
777 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
778 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
779 if(match_count > 1){ //most common
780 *mx= mid_pred(A[0], B[0], C[0]);
781 *my= mid_pred(A[1], B[1], C[1]);
782 }else if(match_count==1){
783 if(left_ref==ref){
784 *mx= A[0];
785 *my= A[1];
786 }else if(top_ref==ref){
787 *mx= B[0];
788 *my= B[1];
789 }else{
790 *mx= C[0];
791 *my= C[1];
793 }else{
794 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
795 *mx= A[0];
796 *my= A[1];
797 }else{
798 *mx= mid_pred(A[0], B[0], C[0]);
799 *my= mid_pred(A[1], B[1], C[1]);
803 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
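/* Small worked example of the median rule above (mid_pred is a median of
 * three): with neighbours A=(4,0), B=(10,2), C=(6,-2) and no unique
 * reference match, the predicted MV is (6,0), componentwise. */
#if 0
static void pred_motion_sketch(void)  /* hypothetical, illustration only */
{
    assert(mid_pred(4, 10,  6) == 6); /* x component */
    assert(mid_pred(0,  2, -2) == 0); /* y component */
}
#endif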
807 * gets the directionally predicted 16x8 MV.
808 * @param n the block index
809 * @param mx the x component of the predicted motion vector
810 * @param my the y component of the predicted motion vector
812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
813 if(n==0){
814 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
815 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
817 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
819 if(top_ref == ref){
820 *mx= B[0];
821 *my= B[1];
822 return;
824 }else{
825 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
826 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
828 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
830 if(left_ref == ref){
831 *mx= A[0];
832 *my= A[1];
833 return;
837 //RARE
838 pred_motion(h, n, 4, list, ref, mx, my);
842 * gets the directionally predicted 8x16 MV.
843 * @param n the block index
844 * @param mx the x component of the predicted motion vector
845 * @param my the y component of the predicted motion vector
847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
848 if(n==0){
849 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
850 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
852 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
854 if(left_ref == ref){
855 *mx= A[0];
856 *my= A[1];
857 return;
859 }else{
860 const int16_t * C;
861 int diagonal_ref;
863 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
865 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
867 if(diagonal_ref == ref){
868 *mx= C[0];
869 *my= C[1];
870 return;
874 //RARE
875 pred_motion(h, n, 2, list, ref, mx, my);
878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
879 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
880 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
882 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
884 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
885 || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
886 || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
888 *mx = *my = 0;
889 return;
892 pred_motion(h, 0, 4, 0, 0, mx, my);
894 return;
897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
898 int poc0 = h->ref_list[0][i].poc;
899 int td = av_clip(poc1 - poc0, -128, 127);
900 if(td == 0 || h->ref_list[0][i].long_ref){
901 return 256;
902 }else{
903 int tb = av_clip(poc - poc0, -128, 127);
904 int tx = (16384 + (FFABS(td) >> 1)) / td;
905 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
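/* Worked example of the temporal-direct scale factor computed above: with
 * the current picture (poc=2) exactly half way between the list0 reference
 * (poc0=0) and the list1 reference (poc1=4), td=4, tb=2, tx=(16384+2)/4=4096
 * and the scale is (2*4096+32)>>6 = 128, so the derived L0 motion vector,
 * (scale*mv_col+128)>>8, becomes mv_col/2 as expected. */
#if 0
static void dist_scale_factor_sketch(void)  /* hypothetical, illustration only */
{
    int td = 4, tb = 2;
    int tx = (16384 + (FFABS(td) >> 1)) / td;
    assert(av_clip((tb*tx + 32) >> 6, -1024, 1023) == 128);
}
#endif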
909 static inline void direct_dist_scale_factor(H264Context * const h){
910 MpegEncContext * const s = &h->s;
911 const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
912 const int poc1 = h->ref_list[1][0].poc;
913 int i, field;
914 for(field=0; field<2; field++){
915 const int poc = h->s.current_picture_ptr->field_poc[field];
916 const int poc1 = h->ref_list[1][0].field_poc[field];
917 for(i=0; i < 2*h->ref_count[0]; i++)
918 h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
921 for(i=0; i<h->ref_count[0]; i++){
922 h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
927 MpegEncContext * const s = &h->s;
928 Picture * const ref1 = &h->ref_list[1][0];
929 int j, old_ref, rfield;
930 int start= mbafi ? 16 : 0;
931 int end = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
932 int interl= mbafi || s->picture_structure != PICT_FRAME;
934 /* bogus; fills in for missing frames */
935 memset(map[list], 0, sizeof(map[list]));
937 for(rfield=0; rfield<2; rfield++){
938 for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
939 int poc = ref1->ref_poc[colfield][list][old_ref];
941 if (!interl)
942 poc |= 3;
943 else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isn't needed
944 poc= (poc&~3) + rfield + 1;
946 for(j=start; j<end; j++){
947 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
948 int cur_ref= mbafi ? (j-16)^field : j;
949 map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
950 if(rfield == field)
951 map[list][old_ref] = cur_ref;
952 break;
959 static inline void direct_ref_list_init(H264Context * const h){
960 MpegEncContext * const s = &h->s;
961 Picture * const ref1 = &h->ref_list[1][0];
962 Picture * const cur = s->current_picture_ptr;
963 int list, j, field;
964 int sidx= (s->picture_structure&1)^1;
965 int ref1sidx= (ref1->reference&1)^1;
967 for(list=0; list<2; list++){
968 cur->ref_count[sidx][list] = h->ref_count[list];
969 for(j=0; j<h->ref_count[list]; j++)
970 cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
973 if(s->picture_structure == PICT_FRAME){
974 memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
975 memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
978 cur->mbaff= FRAME_MBAFF;
980 if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
981 return;
983 for(list=0; list<2; list++){
984 fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
985 for(field=0; field<2; field++)
986 fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
991 MpegEncContext * const s = &h->s;
992 int b8_stride = h->b8_stride;
993 int b4_stride = h->b_stride;
994 int mb_xy = h->mb_xy;
995 int mb_type_col[2];
996 const int16_t (*l1mv0)[2], (*l1mv1)[2];
997 const int8_t *l1ref0, *l1ref1;
998 const int is_b8x8 = IS_8X8(*mb_type);
999 unsigned int sub_mb_type;
1000 int i8, i4;
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1004 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
1006 int cur_poc = s->current_picture_ptr->poc;
1007 int *col_poc = h->ref_list[1]->field_poc;
1008 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009 mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010 b8_stride = 0;
1011 }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012 int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013 mb_xy += s->mb_stride*fieldoff;
1015 goto single_col;
1016 }else{ // AFL/AFR/FR/FL -> AFR/FR
1017 if(IS_INTERLACED(*mb_type)){ // AFL /FL -> AFR/FR
1018 mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021 b8_stride *= 3;
1022 b4_stride *= 6;
1023 //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026 && !is_b8x8){
1027 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028 *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029 }else{
1030 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1033 }else{ // AFR/FR -> AFR/FR
1034 single_col:
1035 mb_type_col[0] =
1036 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037 if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039 * so we know exactly what block size to use */
1040 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042 }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044 *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045 }else{
1046 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
1052 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053 l1mv1 = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054 l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055 l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056 if(!b8_stride){
1057 if(s->mb_y&1){
1058 l1ref0 += h->b8_stride;
1059 l1ref1 += h->b8_stride;
1060 l1mv0 += 2*b4_stride;
1061 l1mv1 += 2*b4_stride;
1065 if(h->direct_spatial_mv_pred){
1066 int ref[2];
1067 int mv[2][2];
1068 int list;
1070 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1072 /* ref = min(neighbors) */
1073 for(list=0; list<2; list++){
1074 int refa = h->ref_cache[list][scan8[0] - 1];
1075 int refb = h->ref_cache[list][scan8[0] - 8];
1076 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077 if(refc == PART_NOT_AVAILABLE)
1078 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079 ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080 if(ref[list] < 0)
1081 ref[list] = -1;
1084 if(ref[0] < 0 && ref[1] < 0){
1085 ref[0] = ref[1] = 0;
1086 mv[0][0] = mv[0][1] =
1087 mv[1][0] = mv[1][1] = 0;
1088 }else{
1089 for(list=0; list<2; list++){
1090 if(ref[list] >= 0)
1091 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092 else
1093 mv[list][0] = mv[list][1] = 0;
1097 if(ref[1] < 0){
1098 if(!is_b8x8)
1099 *mb_type &= ~MB_TYPE_L1;
1100 sub_mb_type &= ~MB_TYPE_L1;
1101 }else if(ref[0] < 0){
1102 if(!is_b8x8)
1103 *mb_type &= ~MB_TYPE_L0;
1104 sub_mb_type &= ~MB_TYPE_L0;
1107 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108 for(i8=0; i8<4; i8++){
1109 int x8 = i8&1;
1110 int y8 = i8>>1;
1111 int xy8 = x8+y8*b8_stride;
1112 int xy4 = 3*x8+y8*b4_stride;
1113 int a=0, b=0;
1115 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116 continue;
1117 h->sub_mb_type[i8] = sub_mb_type;
1119 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121 if(!IS_INTRA(mb_type_col[y8])
1122 && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123 || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124 if(ref[0] > 0)
1125 a= pack16to32(mv[0][0],mv[0][1]);
1126 if(ref[1] > 0)
1127 b= pack16to32(mv[1][0],mv[1][1]);
1128 }else{
1129 a= pack16to32(mv[0][0],mv[0][1]);
1130 b= pack16to32(mv[1][0],mv[1][1]);
1132 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1135 }else if(IS_16X16(*mb_type)){
1136 int a=0, b=0;
1138 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140 if(!IS_INTRA(mb_type_col[0])
1141 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143 && (h->x264_build>33 || !h->x264_build)))){
1144 if(ref[0] > 0)
1145 a= pack16to32(mv[0][0],mv[0][1]);
1146 if(ref[1] > 0)
1147 b= pack16to32(mv[1][0],mv[1][1]);
1148 }else{
1149 a= pack16to32(mv[0][0],mv[0][1]);
1150 b= pack16to32(mv[1][0],mv[1][1]);
1152 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154 }else{
1155 for(i8=0; i8<4; i8++){
1156 const int x8 = i8&1;
1157 const int y8 = i8>>1;
1159 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160 continue;
1161 h->sub_mb_type[i8] = sub_mb_type;
1163 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1168 /* col_zero_flag */
1169 if(!IS_INTRA(mb_type_col[0]) && ( l1ref0[x8 + y8*b8_stride] == 0
1170 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171 && (h->x264_build>33 || !h->x264_build)))){
1172 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173 if(IS_SUB_8X8(sub_mb_type)){
1174 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176 if(ref[0] == 0)
1177 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178 if(ref[1] == 0)
1179 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1181 }else
1182 for(i4=0; i4<4; i4++){
1183 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185 if(ref[0] == 0)
1186 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187 if(ref[1] == 0)
1188 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1194 }else{ /* direct temporal mv pred */
1195 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196 const int *dist_scale_factor = h->dist_scale_factor;
1197 int ref_offset= 0;
1199 if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200 map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201 map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202 dist_scale_factor =h->dist_scale_factor_field[s->mb_y&1];
1204 if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205 ref_offset += 16;
1207 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208 /* FIXME assumes direct_8x8_inference == 1 */
1209 int y_shift = 2*!IS_INTERLACED(*mb_type);
1211 for(i8=0; i8<4; i8++){
1212 const int x8 = i8&1;
1213 const int y8 = i8>>1;
1214 int ref0, scale;
1215 const int16_t (*l1mv)[2]= l1mv0;
1217 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218 continue;
1219 h->sub_mb_type[i8] = sub_mb_type;
1221 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222 if(IS_INTRA(mb_type_col[y8])){
1223 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226 continue;
1229 ref0 = l1ref0[x8 + y8*b8_stride];
1230 if(ref0 >= 0)
1231 ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232 else{
1233 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234 l1mv= l1mv1;
1236 scale = dist_scale_factor[ref0];
1237 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1240 const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241 int my_col = (mv_col[1]<<y_shift)/2;
1242 int mx = (scale * mv_col[0] + 128) >> 8;
1243 int my = (scale * my_col + 128) >> 8;
1244 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1248 return;
1251 /* one-to-one mv scaling */
1253 if(IS_16X16(*mb_type)){
1254 int ref, mv0, mv1;
1256 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257 if(IS_INTRA(mb_type_col[0])){
1258 ref=mv0=mv1=0;
1259 }else{
1260 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262 const int scale = dist_scale_factor[ref0];
1263 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264 int mv_l0[2];
1265 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267 ref= ref0;
1268 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1271 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274 }else{
1275 for(i8=0; i8<4; i8++){
1276 const int x8 = i8&1;
1277 const int y8 = i8>>1;
1278 int ref0, scale;
1279 const int16_t (*l1mv)[2]= l1mv0;
1281 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282 continue;
1283 h->sub_mb_type[i8] = sub_mb_type;
1284 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285 if(IS_INTRA(mb_type_col[0])){
1286 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289 continue;
1292 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293 if(ref0 >= 0)
1294 ref0 = map_col_to_list0[0][ref0];
1295 else{
1296 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297 l1mv= l1mv1;
1299 scale = dist_scale_factor[ref0];
1301 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302 if(IS_SUB_8X8(sub_mb_type)){
1303 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304 int mx = (scale * mv_col[0] + 128) >> 8;
1305 int my = (scale * mv_col[1] + 128) >> 8;
1306 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308 }else
1309 for(i4=0; i4<4; i4++){
1310 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323 MpegEncContext * const s = &h->s;
1324 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326 int list;
1328 if(!USES_LIST(mb_type, 0))
1329 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1331 for(list=0; list<h->list_count; list++){
1332 int y;
1333 if(!USES_LIST(mb_type, list))
1334 continue;
1336 for(y=0; y<4; y++){
1337 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1340 if( h->pps.cabac ) {
1341 if(IS_SKIP(mb_type))
1342 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343 else
1344 for(y=0; y<4; y++){
1345 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1351 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1359 if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360 if(IS_8X8(mb_type)){
1361 uint8_t *direct_table = &h->direct_table[b8_xy];
1362 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1370 * Decodes a network abstraction layer unit.
1371 * @param consumed is the number of bytes used as input
1372 * @param length is the length of the array
1373 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp trailing?
1374 * @returns decoded bytes, might be src+1 if no escapes
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377 int i, si, di;
1378 uint8_t *dst;
1379 int bufidx;
1381 // src[0]&0x80; //forbidden bit
1382 h->nal_ref_idc= src[0]>>5;
1383 h->nal_unit_type= src[0]&0x1F;
1385 src++; length--;
1386 #if 0
1387 for(i=0; i<length; i++)
1388 printf("%2X ", src[i]);
1389 #endif
1390 for(i=0; i+1<length; i+=2){
1391 if(src[i]) continue;
1392 if(i>0 && src[i-1]==0) i--;
1393 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394 if(src[i+2]!=3){
1395 /* startcode, so we must be past the end */
1396 length=i;
1398 break;
1402 if(i>=length-1){ //no escaped 0
1403 *dst_length= length;
1404 *consumed= length+1; //+1 for the header
1405 return src;
1408 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409 h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410 dst= h->rbsp_buffer[bufidx];
1412 if (dst == NULL){
1413 return NULL;
1416 //printf("decoding esc\n");
1417 si=di=0;
1418 while(si<length){
1419 //remove escapes (very rare 1:2^22)
1420 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421 if(src[si+2]==3){ //escape
1422 dst[di++]= 0;
1423 dst[di++]= 0;
1424 si+=3;
1425 continue;
1426 }else //next start code
1427 break;
1430 dst[di++]= src[si++];
1433 *dst_length= di;
1434 *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436 return dst;
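/* Worked example of the escape removal above, assuming a valid H264Context
 * with allocated rbsp buffers (hypothetical caller, illustration only): the
 * encoder inserts an emulation-prevention byte 0x03 after every 00 00 so
 * that start codes cannot appear inside a NAL unit, and decode_nal() strips
 * it again. */
#if 0
static void decode_nal_sketch(H264Context *h)
{
    static const uint8_t nal[6] = { 0x65, 0x00, 0x00, 0x03, 0x01, 0x80 };
    int dst_length, consumed;
    const uint8_t *rbsp = decode_nal(h, nal, &dst_length, &consumed, sizeof(nal));
    /* rbsp now holds 00 00 01 80, dst_length == 4, consumed == 6 */
}
#endif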
1440 * identifies the exact end of the bitstream
1441 * @return the length of the trailing, or 0 if damaged
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444 int v= *src;
1445 int r;
1447 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1449 for(r=1; r<9; r++){
1450 if(v&1) return r;
1451 v>>=1;
1453 return 0;
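/* Worked example: the RBSP ends with a single '1' stop bit followed by zero
 * alignment bits.  For a final byte of 0x80 (1000 0000) the whole byte is
 * trailing and 8 is returned; for 0xA0 (1010 0000) the payload still owns
 * the top two bits, so 6 is returned; a final byte of 0x00 has no stop bit
 * and is reported as damaged (0). */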
1457 * IDCT transforms the 16 dc values and dequantizes them.
1458 * @param qp quantization parameter
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462 int i;
1463 int temp[16]; //FIXME check if this is a good idea
1464 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1465 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1467 //memset(block, 64, 2*256);
1468 //return;
1469 for(i=0; i<4; i++){
1470 const int offset= y_offset[i];
1471 const int z0= block[offset+stride*0] + block[offset+stride*4];
1472 const int z1= block[offset+stride*0] - block[offset+stride*4];
1473 const int z2= block[offset+stride*1] - block[offset+stride*5];
1474 const int z3= block[offset+stride*1] + block[offset+stride*5];
1476 temp[4*i+0]= z0+z3;
1477 temp[4*i+1]= z1+z2;
1478 temp[4*i+2]= z1-z2;
1479 temp[4*i+3]= z0-z3;
1482 for(i=0; i<4; i++){
1483 const int offset= x_offset[i];
1484 const int z0= temp[4*0+i] + temp[4*2+i];
1485 const int z1= temp[4*0+i] - temp[4*2+i];
1486 const int z2= temp[4*1+i] - temp[4*3+i];
1487 const int z3= temp[4*1+i] + temp[4*3+i];
1489 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
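/* Worked example of the transform above: it is a 4x4 Hadamard over the 16
 * luma DC coefficients followed by the (x*qmul+128)>>8 scale.  With only the
 * top-left DC set to 64 and qmul=32, every one of the 16 outputs becomes
 * (64*32+128)>>8 = 8, i.e. a lone DC coefficient is spread evenly over the
 * 16 4x4 luma blocks. */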
1496 #if 0
1498 * DCT transforms the 16 dc values.
1499 * @param qp quantization parameter ??? FIXME
1501 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1502 // const int qmul= dequant_coeff[qp][0];
1503 int i;
1504 int temp[16]; //FIXME check if this is a good idea
1505 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1506 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1508 for(i=0; i<4; i++){
1509 const int offset= y_offset[i];
1510 const int z0= block[offset+stride*0] + block[offset+stride*4];
1511 const int z1= block[offset+stride*0] - block[offset+stride*4];
1512 const int z2= block[offset+stride*1] - block[offset+stride*5];
1513 const int z3= block[offset+stride*1] + block[offset+stride*5];
1515 temp[4*i+0]= z0+z3;
1516 temp[4*i+1]= z1+z2;
1517 temp[4*i+2]= z1-z2;
1518 temp[4*i+3]= z0-z3;
1521 for(i=0; i<4; i++){
1522 const int offset= x_offset[i];
1523 const int z0= temp[4*0+i] + temp[4*2+i];
1524 const int z1= temp[4*0+i] - temp[4*2+i];
1525 const int z2= temp[4*1+i] - temp[4*3+i];
1526 const int z3= temp[4*1+i] + temp[4*3+i];
1528 block[stride*0 +offset]= (z0 + z3)>>1;
1529 block[stride*2 +offset]= (z1 + z2)>>1;
1530 block[stride*8 +offset]= (z1 - z2)>>1;
1531 block[stride*10+offset]= (z0 - z3)>>1;
1534 #endif
1536 #undef xStride
1537 #undef stride
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540 const int stride= 16*2;
1541 const int xStride= 16;
1542 int a,b,c,d,e;
1544 a= block[stride*0 + xStride*0];
1545 b= block[stride*0 + xStride*1];
1546 c= block[stride*1 + xStride*0];
1547 d= block[stride*1 + xStride*1];
1549 e= a-b;
1550 a= a+b;
1551 b= c-d;
1552 c= c+d;
1554 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
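/* Worked example of the 2x2 transform above: a 2x2 Hadamard butterfly over
 * the four chroma DC coefficients followed by the (x*qmul)>>7 scale.  With
 * only a=64 set and qmul=16, all four outputs become (64*16)>>7 = 8, so the
 * DC energy is spread evenly over the four 4x4 chroma blocks. */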
1560 #if 0
1561 static void chroma_dc_dct_c(DCTELEM *block){
1562 const int stride= 16*2;
1563 const int xStride= 16;
1564 int a,b,c,d,e;
1566 a= block[stride*0 + xStride*0];
1567 b= block[stride*0 + xStride*1];
1568 c= block[stride*1 + xStride*0];
1569 d= block[stride*1 + xStride*1];
1571 e= a-b;
1572 a= a+b;
1573 b= c-d;
1574 c= c+d;
1576 block[stride*0 + xStride*0]= (a+c);
1577 block[stride*0 + xStride*1]= (e+b);
1578 block[stride*1 + xStride*0]= (a-c);
1579 block[stride*1 + xStride*1]= (e-b);
1581 #endif
1584 * gets the chroma qp.
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587 return h->pps.chroma_qp_table[t][qscale];
1590 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1591 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
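/**
 * Quantize a 4x4 coefficient block in the given scan order, writing the
 * quantized levels back into block[].
 * @return scan position of the last non-zero coefficient
 */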
1592 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1593 int i;
1594 const int * const quant_table= quant_coeff[qscale];
1595 const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1596 const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1597 const unsigned int threshold2= (threshold1<<1);
1598 int last_non_zero;
1600 if(separate_dc){
1601 if(qscale<=18){
1602 //avoid overflows
1603 const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1604 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1605 const unsigned int dc_threshold2= (dc_threshold1<<1);
1607 int level= block[0]*quant_coeff[qscale+18][0];
1608 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1609 if(level>0){
1610 level= (dc_bias + level)>>(QUANT_SHIFT-2);
1611 block[0]= level;
1612 }else{
1613 level= (dc_bias - level)>>(QUANT_SHIFT-2);
1614 block[0]= -level;
1616 // last_non_zero = i;
1617 }else{
1618 block[0]=0;
1620 }else{
1621 const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1622 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1623 const unsigned int dc_threshold2= (dc_threshold1<<1);
1625 int level= block[0]*quant_table[0];
1626 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1627 if(level>0){
1628 level= (dc_bias + level)>>(QUANT_SHIFT+1);
1629 block[0]= level;
1630 }else{
1631 level= (dc_bias - level)>>(QUANT_SHIFT+1);
1632 block[0]= -level;
1634 // last_non_zero = i;
1635 }else{
1636 block[0]=0;
1639 last_non_zero= 0;
1640 i=1;
1641 }else{
1642 last_non_zero= -1;
1643 i=0;
1646 for(; i<16; i++){
1647 const int j= scantable[i];
1648 int level= block[j]*quant_table[j];
1650 // if( bias+level >= (1<<(QMAT_SHIFT - 3))
1651 // || bias-level >= (1<<(QMAT_SHIFT - 3))){
1652 if(((unsigned)(level+threshold1))>threshold2){
1653 if(level>0){
1654 level= (bias + level)>>QUANT_SHIFT;
1655 block[j]= level;
1656 }else{
1657 level= (bias - level)>>QUANT_SHIFT;
1658 block[j]= -level;
1660 last_non_zero = i;
1661 }else{
1662 block[j]=0;
1666 return last_non_zero;
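/**
 * Motion-compensate one partition for a single prediction direction (list):
 * quarter-pel luma and eighth-pel chroma interpolation from the given
 * reference picture, with edge emulation when the motion vector points
 * outside the picture.
 */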
1669 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1670 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1671 int src_x_offset, int src_y_offset,
1672 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1673 MpegEncContext * const s = &h->s;
1674 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1675 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1676 const int luma_xy= (mx&3) + ((my&3)<<2);
1677 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1678 uint8_t * src_cb, * src_cr;
1679 int extra_width= h->emu_edge_width;
1680 int extra_height= h->emu_edge_height;
1681 int emu=0;
1682 const int full_mx= mx>>2;
1683 const int full_my= my>>2;
1684 const int pic_width = 16*s->mb_width;
1685 const int pic_height = 16*s->mb_height >> MB_FIELD;
1687 if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1688 return;
1690 if(mx&7) extra_width -= 3;
1691 if(my&7) extra_height -= 3;
1693 if( full_mx < 0-extra_width
1694 || full_my < 0-extra_height
1695 || full_mx + 16/*FIXME*/ > pic_width + extra_width
1696 || full_my + 16/*FIXME*/ > pic_height + extra_height){
1697 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1698 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1699 emu=1;
1702 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1703 if(!square){
1704 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1707 if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1709 if(MB_FIELD){
1710 // chroma offset when predicting from a field of opposite parity
1711 my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1712 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1714 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1715 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1717 if(emu){
1718 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1719 src_cb= s->edge_emu_buffer;
1721 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1723 if(emu){
1724 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1725 src_cr= s->edge_emu_buffer;
1727 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
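/**
 * Unweighted prediction of one partition: the list0 prediction is written
 * with the put functions; for bi-prediction the list1 prediction is then
 * averaged in with the avg functions.
 */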
1730 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1731 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1732 int x_offset, int y_offset,
1733 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1734 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1735 int list0, int list1){
1736 MpegEncContext * const s = &h->s;
1737 qpel_mc_func *qpix_op= qpix_put;
1738 h264_chroma_mc_func chroma_op= chroma_put;
1740 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1741 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1742 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1743 x_offset += 8*s->mb_x;
1744 y_offset += 8*(s->mb_y >> MB_FIELD);
1746 if(list0){
1747 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1748 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1749 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1750 qpix_op, chroma_op);
1752 qpix_op= qpix_avg;
1753 chroma_op= chroma_avg;
1756 if(list1){
1757 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1758 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1759 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1760 qpix_op, chroma_op);
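/**
 * Weighted prediction of one partition: for bi-prediction the list1 block is
 * rendered into the obmc scratchpad and blended with the list0 block using
 * the explicit or implicit weight tables; for uni-prediction the weight and
 * offset are applied in place.
 */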
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766 int x_offset, int y_offset,
1767 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770 int list0, int list1){
1771 MpegEncContext * const s = &h->s;
1773 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1774 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1775 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1776 x_offset += 8*s->mb_x;
1777 y_offset += 8*(s->mb_y >> MB_FIELD);
1779 if(list0 && list1){
1780 /* don't optimize for luma-only case, since B-frames usually
1781 * use implicit weights => chroma too. */
1782 uint8_t *tmp_cb = s->obmc_scratchpad;
1783 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785 int refn0 = h->ref_cache[0][ scan8[n] ];
1786 int refn1 = h->ref_cache[1][ scan8[n] ];
1788 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789 dest_y, dest_cb, dest_cr,
1790 x_offset, y_offset, qpix_put, chroma_put);
1791 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792 tmp_y, tmp_cb, tmp_cr,
1793 x_offset, y_offset, qpix_put, chroma_put);
1795 if(h->use_weight == 2){
1796 int weight0 = h->implicit_weight[refn0][refn1];
1797 int weight1 = 64 - weight0;
1798 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1799 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801 }else{
1802 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1812 }else{
1813 int list = list1 ? 1 : 0;
1814 int refn = h->ref_cache[list][ scan8[n] ];
1815 Picture *ref= &h->ref_list[list][refn];
1816 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818 qpix_put, chroma_put);
1820 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822 if(h->use_weight_chroma){
1823 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
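/**
 * Choose between weighted and unweighted prediction for one partition: the
 * weighted path is used for explicit weighted prediction (use_weight==1) and
 * for implicit bi-prediction whose weight pair differs from the default 32/32.
 */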
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833 int x_offset, int y_offset,
1834 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837 int list0, int list1){
1838 if((h->use_weight==2 && list0 && list1
1839 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840 || h->use_weight==1)
1841 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842 x_offset, y_offset, qpix_put, chroma_put,
1843 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844 else
1845 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1849 static inline void prefetch_motion(H264Context *h, int list){
1850 /* fetch pixels for estimated mv 4 macroblocks ahead
1851 * optimized for 64-byte cache lines */
1852 MpegEncContext * const s = &h->s;
1853 const int refn = h->ref_cache[list][scan8[0]];
1854 if(refn >= 0){
1855 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857 uint8_t **src= h->ref_list[list][refn].data;
1858 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
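/**
 * Motion compensation of a whole inter macroblock: partitions are dispatched
 * according to mb_type and sub_mb_type down to 4x4 blocks, with list0
 * reference prefetching before and list1 prefetching after.
 */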
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869 MpegEncContext * const s = &h->s;
1870 const int mb_xy= h->mb_xy;
1871 const int mb_type= s->current_picture.mb_type[mb_xy];
1873 assert(IS_INTER(mb_type));
1875 prefetch_motion(h, 0);
1877 if(IS_16X16(mb_type)){
1878 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880 &weight_op[0], &weight_avg[0],
1881 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882 }else if(IS_16X8(mb_type)){
1883 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885 &weight_op[1], &weight_avg[1],
1886 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889 &weight_op[1], &weight_avg[1],
1890 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891 }else if(IS_8X16(mb_type)){
1892 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894 &weight_op[2], &weight_avg[2],
1895 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898 &weight_op[2], &weight_avg[2],
1899 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900 }else{
1901 int i;
1903 assert(IS_8X8(mb_type));
1905 for(i=0; i<4; i++){
1906 const int sub_mb_type= h->sub_mb_type[i];
1907 const int n= 4*i;
1908 int x_offset= (i&1)<<2;
1909 int y_offset= (i&2)<<1;
1911 if(IS_SUB_8X8(sub_mb_type)){
1912 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914 &weight_op[3], &weight_avg[3],
1915 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916 }else if(IS_SUB_8X4(sub_mb_type)){
1917 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919 &weight_op[4], &weight_avg[4],
1920 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923 &weight_op[4], &weight_avg[4],
1924 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925 }else if(IS_SUB_4X8(sub_mb_type)){
1926 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928 &weight_op[5], &weight_avg[5],
1929 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932 &weight_op[5], &weight_avg[5],
1933 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934 }else{
1935 int j;
1936 assert(IS_SUB_4X4(sub_mb_type));
1937 for(j=0; j<4; j++){
1938 int sub_x_offset= x_offset + 2*(j&1);
1939 int sub_y_offset= y_offset + (j&2);
1940 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942 &weight_op[6], &weight_avg[6],
1943 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1949 prefetch_motion(h, 1);
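/**
 * One-time initialization of the static CAVLC tables (coeff_token,
 * chroma DC coeff_token, total_zeros and run VLCs) into their
 * preallocated static storage.
 */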
1952 static av_cold void decode_init_vlc(void){
1953 static int done = 0;
1955 if (!done) {
1956 int i;
1957 int offset;
1958 done = 1;
1960 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1961 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1962 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1963 &chroma_dc_coeff_token_len [0], 1, 1,
1964 &chroma_dc_coeff_token_bits[0], 1, 1,
1965 INIT_VLC_USE_NEW_STATIC);
1967 offset = 0;
1968 for(i=0; i<4; i++){
1969 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1970 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1971 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1972 &coeff_token_len [i][0], 1, 1,
1973 &coeff_token_bits[i][0], 1, 1,
1974 INIT_VLC_USE_NEW_STATIC);
1975 offset += coeff_token_vlc_tables_size[i];
1978 * This is a one time safety check to make sure that
1979 * the packed static coeff_token_vlc table sizes
1980 * were initialized correctly.
1982 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1984 for(i=0; i<3; i++){
1985 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1986 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1987 init_vlc(&chroma_dc_total_zeros_vlc[i],
1988 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1989 &chroma_dc_total_zeros_len [i][0], 1, 1,
1990 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1991 INIT_VLC_USE_NEW_STATIC);
1993 for(i=0; i<15; i++){
1994 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1995 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1996 init_vlc(&total_zeros_vlc[i],
1997 TOTAL_ZEROS_VLC_BITS, 16,
1998 &total_zeros_len [i][0], 1, 1,
1999 &total_zeros_bits[i][0], 1, 1,
2000 INIT_VLC_USE_NEW_STATIC);
2003 for(i=0; i<6; i++){
2004 run_vlc[i].table = run_vlc_tables[i];
2005 run_vlc[i].table_allocated = run_vlc_tables_size;
2006 init_vlc(&run_vlc[i],
2007 RUN_VLC_BITS, 7,
2008 &run_len [i][0], 1, 1,
2009 &run_bits[i][0], 1, 1,
2010 INIT_VLC_USE_NEW_STATIC);
2012 run7_vlc.table = run7_vlc_table;
2013 run7_vlc.table_allocated = run7_vlc_table_size;
2014 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2015 &run_len [6][0], 1, 1,
2016 &run_bits[6][0], 1, 1,
2017 INIT_VLC_USE_NEW_STATIC);
2021 static void free_tables(H264Context *h){
2022 int i;
2023 H264Context *hx;
2024 av_freep(&h->intra4x4_pred_mode);
2025 av_freep(&h->chroma_pred_mode_table);
2026 av_freep(&h->cbp_table);
2027 av_freep(&h->mvd_table[0]);
2028 av_freep(&h->mvd_table[1]);
2029 av_freep(&h->direct_table);
2030 av_freep(&h->non_zero_count);
2031 av_freep(&h->slice_table_base);
2032 h->slice_table= NULL;
2034 av_freep(&h->mb2b_xy);
2035 av_freep(&h->mb2b8_xy);
2037 for(i = 0; i < h->s.avctx->thread_count; i++) {
2038 hx = h->thread_context[i];
2039 if(!hx) continue;
2040 av_freep(&hx->top_borders[1]);
2041 av_freep(&hx->top_borders[0]);
2042 av_freep(&hx->s.obmc_scratchpad);
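/*
 * The dequant tables below are built as
 *   dequant[i][q][x] = base[q%6][...] * scaling_matrix[i][x] << (q/6)
 * (the 4x4 case uses q/6 + 2 as shift), so a QP step of 6 doubles the scale;
 * identical scaling matrices share a single table.
 */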
2046 static void init_dequant8_coeff_table(H264Context *h){
2047 int i,q,x;
2048 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2049 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2050 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2052 for(i=0; i<2; i++ ){
2053 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2054 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2055 break;
2058 for(q=0; q<52; q++){
2059 int shift = div6[q];
2060 int idx = rem6[q];
2061 for(x=0; x<64; x++)
2062 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2063 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2064 h->pps.scaling_matrix8[i][x]) << shift;
2069 static void init_dequant4_coeff_table(H264Context *h){
2070 int i,j,q,x;
2071 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2072 for(i=0; i<6; i++ ){
2073 h->dequant4_coeff[i] = h->dequant4_buffer[i];
2074 for(j=0; j<i; j++){
2075 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2076 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2077 break;
2080 if(j<i)
2081 continue;
2083 for(q=0; q<52; q++){
2084 int shift = div6[q] + 2;
2085 int idx = rem6[q];
2086 for(x=0; x<16; x++)
2087 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2088 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2089 h->pps.scaling_matrix4[i][x]) << shift;
2094 static void init_dequant_tables(H264Context *h){
2095 int i,x;
2096 init_dequant4_coeff_table(h);
2097 if(h->pps.transform_8x8_mode)
2098 init_dequant8_coeff_table(h);
2099 if(h->sps.transform_bypass){
2100 for(i=0; i<6; i++)
2101 for(x=0; x<16; x++)
2102 h->dequant4_coeff[i][0][x] = 1<<6;
2103 if(h->pps.transform_8x8_mode)
2104 for(i=0; i<2; i++)
2105 for(x=0; x<64; x++)
2106 h->dequant8_coeff[i][0][x] = 1<<6;
2112 * allocates tables.
2113 * needs width/height
2115 static int alloc_tables(H264Context *h){
2116 MpegEncContext * const s = &h->s;
2117 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2118 int x,y;
2120 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2122 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2123 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2124 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2126 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2127 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2128 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2129 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2131 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
2132 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2134 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2135 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136 for(y=0; y<s->mb_height; y++){
2137 for(x=0; x<s->mb_width; x++){
2138 const int mb_xy= x + y*s->mb_stride;
2139 const int b_xy = 4*x + 4*y*h->b_stride;
2140 const int b8_xy= 2*x + 2*y*h->b8_stride;
2142 h->mb2b_xy [mb_xy]= b_xy;
2143 h->mb2b8_xy[mb_xy]= b8_xy;
2147 s->obmc_scratchpad = NULL;
2149 if(!h->dequant4_coeff[0])
2150 init_dequant_tables(h);
2152 return 0;
2153 fail:
2154 free_tables(h);
2155 return -1;
2159 * Mimic alloc_tables(), but for every context thread.
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2163 dst->non_zero_count = src->non_zero_count;
2164 dst->slice_table = src->slice_table;
2165 dst->cbp_table = src->cbp_table;
2166 dst->mb2b_xy = src->mb2b_xy;
2167 dst->mb2b8_xy = src->mb2b8_xy;
2168 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2169 dst->mvd_table[0] = src->mvd_table[0];
2170 dst->mvd_table[1] = src->mvd_table[1];
2171 dst->direct_table = src->direct_table;
2173 dst->s.obmc_scratchpad = NULL;
2174 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2178 * Init context
2179 * Allocate buffers which are not shared amongst multiple threads.
2181 static int context_init(H264Context *h){
2182 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2183 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2185 return 0;
2186 fail:
2187 return -1; // free_tables will clean up for us
2190 static av_cold void common_init(H264Context *h){
2191 MpegEncContext * const s = &h->s;
2193 s->width = s->avctx->width;
2194 s->height = s->avctx->height;
2195 s->codec_id= s->avctx->codec->id;
2197 ff_h264_pred_init(&h->hpc, s->codec_id);
2199 h->dequant_coeff_pps= -1;
2200 s->unrestricted_mv=1;
2201 s->decode=1; //FIXME
2203 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2204 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2207 static av_cold int decode_init(AVCodecContext *avctx){
2208 H264Context *h= avctx->priv_data;
2209 MpegEncContext * const s = &h->s;
2211 MPV_decode_defaults(s);
2213 s->avctx = avctx;
2214 common_init(h);
2216 s->out_format = FMT_H264;
2217 s->workaround_bugs= avctx->workaround_bugs;
2219 // set defaults
2220 // s->decode_mb= ff_h263_decode_mb;
2221 s->quarter_sample = 1;
2222 s->low_delay= 1;
2224 if(avctx->codec_id == CODEC_ID_SVQ3)
2225 avctx->pix_fmt= PIX_FMT_YUVJ420P;
2226 else
2227 avctx->pix_fmt= PIX_FMT_YUV420P;
2229 decode_init_vlc();
2231 if(avctx->extradata_size > 0 && avctx->extradata &&
2232 *(char *)avctx->extradata == 1){
2233 h->is_avc = 1;
2234 h->got_avcC = 0;
2235 } else {
2236 h->is_avc = 0;
2239 h->thread_context[0] = h;
2240 h->outputed_poc = INT_MIN;
2241 h->prev_poc_msb= 1<<16;
2242 return 0;
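/**
 * Per-frame/field setup before decoding slices: start the MPV frame,
 * precompute block_offset[] and allocate the bi-prediction scratch buffer.
 */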
2245 static int frame_start(H264Context *h){
2246 MpegEncContext * const s = &h->s;
2247 int i;
2249 if(MPV_frame_start(s, s->avctx) < 0)
2250 return -1;
2251 ff_er_frame_start(s);
2253 * MPV_frame_start uses pict_type to derive key_frame.
2254 * This is incorrect for H.264; IDR markings must be used.
2255 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2256 * See decode_nal_units().
2258 s->current_picture_ptr->key_frame= 0;
2260 assert(s->linesize && s->uvlinesize);
2262 for(i=0; i<16; i++){
2263 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2266 for(i=0; i<4; i++){
2267 h->block_offset[16+i]=
2268 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269 h->block_offset[24+16+i]=
2270 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2273 /* can't be in alloc_tables because linesize isn't known there.
2274 * FIXME: redo bipred weight to not require extra buffer? */
2275 for(i = 0; i < s->avctx->thread_count; i++)
2276 if(!h->thread_context[i]->s.obmc_scratchpad)
2277 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2279 /* some macroblocks will be accessed before they're available */
2280 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2283 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2285 // We mark the current picture as non-reference after allocating it, so
2286 // that if we break out due to an error it can be released automatically
2287 // in the next MPV_frame_start().
2288 // SVQ3, as well as most other codecs, has only last/next/current and thus
2289 // gets released even with the reference flag set; besides, SVQ3 and others
2290 // do not mark frames as reference later "naturally".
2291 if(s->codec_id != CODEC_ID_SVQ3)
2292 s->current_picture_ptr->reference= 0;
2294 s->current_picture_ptr->field_poc[0]=
2295 s->current_picture_ptr->field_poc[1]= INT_MAX;
2296 assert(s->current_picture_ptr->long_ref==0);
2298 return 0;
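/**
 * Save the bottom row and right column of the just-decoded macroblock into
 * top_borders/left_border so that intra prediction of following macroblocks
 * can use these pre-deblocking samples.
 */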
2301 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2302 MpegEncContext * const s = &h->s;
2303 int i;
2304 int step = 1;
2305 int offset = 1;
2306 int uvoffset= 1;
2307 int top_idx = 1;
2308 int skiplast= 0;
2310 src_y -= linesize;
2311 src_cb -= uvlinesize;
2312 src_cr -= uvlinesize;
2314 if(!simple && FRAME_MBAFF){
2315 if(s->mb_y&1){
2316 offset = MB_MBAFF ? 1 : 17;
2317 uvoffset= MB_MBAFF ? 1 : 9;
2318 if(!MB_MBAFF){
2319 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2320 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2321 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2322 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2323 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2326 }else{
2327 if(!MB_MBAFF){
2328 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2329 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2330 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2331 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2333 skiplast= 1;
2335 offset =
2336 uvoffset=
2337 top_idx = MB_MBAFF ? 0 : 1;
2339 step= MB_MBAFF ? 2 : 1;
2342 // There are two lines saved, the line above the top macroblock of a pair,
2343 // and the line above the bottom macroblock
2344 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2345 for(i=1; i<17 - skiplast; i++){
2346 h->left_border[offset+i*step]= src_y[15+i* linesize];
2349 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2350 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2352 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2353 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2354 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2355 for(i=1; i<9 - skiplast; i++){
2356 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2357 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2359 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2360 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
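/**
 * Exchange the neighbouring edge pixels in the frame with the saved
 * pre-deblocking borders: called with xchg=1 before intra prediction (swap in
 * the unfiltered samples) and with xchg=0 afterwards (restore the filtered
 * ones).
 */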
2364 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2365 MpegEncContext * const s = &h->s;
2366 int temp8, i;
2367 uint64_t temp64;
2368 int deblock_left;
2369 int deblock_top;
2370 int mb_xy;
2371 int step = 1;
2372 int offset = 1;
2373 int uvoffset= 1;
2374 int top_idx = 1;
2376 if(!simple && FRAME_MBAFF){
2377 if(s->mb_y&1){
2378 offset = MB_MBAFF ? 1 : 17;
2379 uvoffset= MB_MBAFF ? 1 : 9;
2380 }else{
2381 offset =
2382 uvoffset=
2383 top_idx = MB_MBAFF ? 0 : 1;
2385 step= MB_MBAFF ? 2 : 1;
2388 if(h->deblocking_filter == 2) {
2389 mb_xy = h->mb_xy;
2390 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2391 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2392 } else {
2393 deblock_left = (s->mb_x > 0);
2394 deblock_top = (s->mb_y > !!MB_FIELD);
2397 src_y -= linesize + 1;
2398 src_cb -= uvlinesize + 1;
2399 src_cr -= uvlinesize + 1;
2401 #define XCHG(a,b,t,xchg)\
2402 t= a;\
2403 if(xchg)\
2404 a= b;\
2405 b= t;
2407 if(deblock_left){
2408 for(i = !deblock_top; i<16; i++){
2409 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2411 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
2414 if(deblock_top){
2415 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2416 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2417 if(s->mb_x+1 < s->mb_width){
2418 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2422 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2423 if(deblock_left){
2424 for(i = !deblock_top; i<8; i++){
2425 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2426 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2428 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2429 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2431 if(deblock_top){
2432 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2433 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
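/**
 * Decode one macroblock to pixels: intra prediction or motion compensation,
 * inverse transforms, then deblocking. With simple=1 the MBAFF, PCM,
 * grayscale and non-H.264 paths are compiled away.
 */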
2438 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2439 MpegEncContext * const s = &h->s;
2440 const int mb_x= s->mb_x;
2441 const int mb_y= s->mb_y;
2442 const int mb_xy= h->mb_xy;
2443 const int mb_type= s->current_picture.mb_type[mb_xy];
2444 uint8_t *dest_y, *dest_cb, *dest_cr;
2445 int linesize, uvlinesize /*dct_offset*/;
2446 int i;
2447 int *block_offset = &h->block_offset[0];
2448 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2449 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2450 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2452 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2453 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2454 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2456 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2457 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2459 if (!simple && MB_FIELD) {
2460 linesize = h->mb_linesize = s->linesize * 2;
2461 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2462 block_offset = &h->block_offset[24];
2463 if(mb_y&1){ //FIXME move out of this function?
2464 dest_y -= s->linesize*15;
2465 dest_cb-= s->uvlinesize*7;
2466 dest_cr-= s->uvlinesize*7;
2468 if(FRAME_MBAFF) {
2469 int list;
2470 for(list=0; list<h->list_count; list++){
2471 if(!USES_LIST(mb_type, list))
2472 continue;
2473 if(IS_16X16(mb_type)){
2474 int8_t *ref = &h->ref_cache[list][scan8[0]];
2475 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2476 }else{
2477 for(i=0; i<16; i+=4){
2478 int ref = h->ref_cache[list][scan8[i]];
2479 if(ref >= 0)
2480 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2485 } else {
2486 linesize = h->mb_linesize = s->linesize;
2487 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2488 // dct_offset = s->linesize * 16;
2491 if(transform_bypass){
2492 idct_dc_add =
2493 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2494 }else if(IS_8x8DCT(mb_type)){
2495 idct_dc_add = s->dsp.h264_idct8_dc_add;
2496 idct_add = s->dsp.h264_idct8_add;
2497 }else{
2498 idct_dc_add = s->dsp.h264_idct_dc_add;
2499 idct_add = s->dsp.h264_idct_add;
2502 if (!simple && IS_INTRA_PCM(mb_type)) {
2503 for (i=0; i<16; i++) {
2504 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2506 for (i=0; i<8; i++) {
2507 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2508 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2510 } else {
2511 if(IS_INTRA(mb_type)){
2512 if(h->deblocking_filter)
2513 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2515 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2516 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2517 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2520 if(IS_INTRA4x4(mb_type)){
2521 if(simple || !s->encoding){
2522 if(IS_8x8DCT(mb_type)){
2523 for(i=0; i<16; i+=4){
2524 uint8_t * const ptr= dest_y + block_offset[i];
2525 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2526 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2527 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2528 (h->topright_samples_available<<i)&0x4000, linesize);
2529 if(nnz){
2530 if(nnz == 1 && h->mb[i*16])
2531 idct_dc_add(ptr, h->mb + i*16, linesize);
2532 else
2533 idct_add(ptr, h->mb + i*16, linesize);
2536 }else
2537 for(i=0; i<16; i++){
2538 uint8_t * const ptr= dest_y + block_offset[i];
2539 uint8_t *topright;
2540 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2541 int nnz, tr;
2543 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2544 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2545 assert(mb_y || linesize <= block_offset[i]);
2546 if(!topright_avail){
2547 tr= ptr[3 - linesize]*0x01010101;
2548 topright= (uint8_t*) &tr;
2549 }else
2550 topright= ptr + 4 - linesize;
2551 }else
2552 topright= NULL;
2554 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2555 nnz = h->non_zero_count_cache[ scan8[i] ];
2556 if(nnz){
2557 if(is_h264){
2558 if(nnz == 1 && h->mb[i*16])
2559 idct_dc_add(ptr, h->mb + i*16, linesize);
2560 else
2561 idct_add(ptr, h->mb + i*16, linesize);
2562 }else
2563 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2567 }else{
2568 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2569 if(is_h264){
2570 if(!transform_bypass)
2571 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2572 }else
2573 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2575 if(h->deblocking_filter)
2576 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2577 }else if(is_h264){
2578 hl_motion(h, dest_y, dest_cb, dest_cr,
2579 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2580 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2581 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2585 if(!IS_INTRA4x4(mb_type)){
2586 if(is_h264){
2587 if(IS_INTRA16x16(mb_type)){
2588 for(i=0; i<16; i++){
2589 if(h->non_zero_count_cache[ scan8[i] ])
2590 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2591 else if(h->mb[i*16])
2592 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2594 }else{
2595 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2596 for(i=0; i<16; i+=di){
2597 int nnz = h->non_zero_count_cache[ scan8[i] ];
2598 if(nnz){
2599 if(nnz==1 && h->mb[i*16])
2600 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2601 else
2602 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2606 }else{
2607 for(i=0; i<16; i++){
2608 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2609 uint8_t * const ptr= dest_y + block_offset[i];
2610 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2616 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2617 uint8_t *dest[2] = {dest_cb, dest_cr};
2618 if(transform_bypass){
2619 idct_add = idct_dc_add = s->dsp.add_pixels4;
2620 }else{
2621 idct_add = s->dsp.h264_idct_add;
2622 idct_dc_add = s->dsp.h264_idct_dc_add;
2623 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2624 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2626 if(is_h264){
2627 for(i=16; i<16+8; i++){
2628 if(h->non_zero_count_cache[ scan8[i] ])
2629 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2630 else if(h->mb[i*16])
2631 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2633 }else{
2634 for(i=16; i<16+8; i++){
2635 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2636 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2637 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2643 if(h->deblocking_filter) {
2644 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2645 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2646 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2647 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2648 if (!simple && FRAME_MBAFF) {
2649 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2650 } else {
2651 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2657 * Process a macroblock; this case avoids checks for expensive uncommon cases.
2659 static void hl_decode_mb_simple(H264Context *h){
2660 hl_decode_mb_internal(h, 1);
2664 * Process a macroblock; this handles edge cases, such as interlacing.
2666 static void av_noinline hl_decode_mb_complex(H264Context *h){
2667 hl_decode_mb_internal(h, 0);
2670 static void hl_decode_mb(H264Context *h){
2671 MpegEncContext * const s = &h->s;
2672 const int mb_xy= h->mb_xy;
2673 const int mb_type= s->current_picture.mb_type[mb_xy];
2674 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2675 (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2677 if(ENABLE_H264_ENCODER && !s->decode)
2678 return;
2680 if (is_complex)
2681 hl_decode_mb_complex(h);
2682 else hl_decode_mb_simple(h);
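/**
 * Convert a frame Picture into a view of one of its fields: offset the data
 * pointers for the bottom field, double the linesizes and take the poc and
 * reference marking of the selected field.
 */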
2685 static void pic_as_field(Picture *pic, const int parity){
2686 int i;
2687 for (i = 0; i < 4; ++i) {
2688 if (parity == PICT_BOTTOM_FIELD)
2689 pic->data[i] += pic->linesize[i];
2690 pic->reference = parity;
2691 pic->linesize[i] *= 2;
2693 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
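/**
 * Copy *src into *dest as a reference of the given parity if src is
 * referenced in that parity; field copies are converted with pic_as_field()
 * and pic_id is rescaled to field numbering.
 * @return 1 if a copy was made, 0 otherwise
 */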
2696 static int split_field_copy(Picture *dest, Picture *src,
2697 int parity, int id_add){
2698 int match = !!(src->reference & parity);
2700 if (match) {
2701 *dest = *src;
2702 if(parity != PICT_FRAME){
2703 pic_as_field(dest, parity);
2704 dest->pic_id *= 2;
2705 dest->pic_id += id_add;
2709 return match;
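/**
 * Build a default reference list in def[] from in[]: for field decoding,
 * pictures referenced in the parity selected by sel are interleaved with
 * those of the opposite parity; for frame decoding this reduces to copying
 * the referenced pictures in order.
 * @return number of entries written to def
 */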
2712 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2713 int i[2]={0};
2714 int index=0;
2716 while(i[0]<len || i[1]<len){
2717 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2718 i[0]++;
2719 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2720 i[1]++;
2721 if(i[0] < len){
2722 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2723 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2725 if(i[1] < len){
2726 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2727 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2731 return index;
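/**
 * Append to sorted[] the pictures from src[] ordered by POC relative to
 * limit: dir=0 takes POCs above the limit in ascending order, dir=1 takes
 * POCs at or below it in descending order (the two halves of a B-slice
 * default list).
 * @return number of entries appended
 */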
2734 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2735 int i, best_poc;
2736 int out_i= 0;
2738 for(;;){
2739 best_poc= dir ? INT_MIN : INT_MAX;
2741 for(i=0; i<len; i++){
2742 const int poc= src[i]->poc;
2743 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2744 best_poc= poc;
2745 sorted[out_i]= src[i];
2748 if(best_poc == (dir ? INT_MIN : INT_MAX))
2749 break;
2750 limit= sorted[out_i++]->poc - dir;
2752 return out_i;
2756 * fills the default_ref_list.
2758 static int fill_default_ref_list(H264Context *h){
2759 MpegEncContext * const s = &h->s;
2760 int i, len;
2762 if(h->slice_type_nos==FF_B_TYPE){
2763 Picture *sorted[32];
2764 int cur_poc, list;
2765 int lens[2];
2767 if(FIELD_PICTURE)
2768 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2769 else
2770 cur_poc= s->current_picture_ptr->poc;
2772 for(list= 0; list<2; list++){
2773 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2774 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2775 assert(len<=32);
2776 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2777 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2778 assert(len<=32);
2780 if(len < h->ref_count[list])
2781 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2782 lens[list]= len;
2785 if(lens[0] == lens[1] && lens[1] > 1){
2786 for(i=0; i<lens[0] && h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++);
2787 if(i == lens[0])
2788 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2790 }else{
2791 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2792 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2793 assert(len <= 32);
2794 if(len < h->ref_count[0])
2795 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2797 #ifdef TRACE
2798 for (i=0; i<h->ref_count[0]; i++) {
2799 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2801 if(h->slice_type_nos==FF_B_TYPE){
2802 for (i=0; i<h->ref_count[1]; i++) {
2803 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2806 #endif
2807 return 0;
2810 static void print_short_term(H264Context *h);
2811 static void print_long_term(H264Context *h);
2814 * Extract structure information about the picture described by pic_num in
2815 * the current decoding context (frame or field). Note that pic_num is
2816 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2817 * @param pic_num picture number for which to extract structure information
2818 * @param structure one of PICT_XXX describing structure of picture
2819 * with pic_num
2820 * @return frame number (short term) or long term index of picture
2821 * described by pic_num
2823 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2824 MpegEncContext * const s = &h->s;
2826 *structure = s->picture_structure;
2827 if(FIELD_PICTURE){
2828 if (!(pic_num & 1))
2829 /* opposite field */
2830 *structure ^= PICT_FRAME;
2831 pic_num >>= 1;
2834 return pic_num;
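/**
 * Parse the ref_pic_list_reordering() syntax from the slice header and apply
 * the requested reordering to ref_list[], starting from the default lists.
 */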
2837 static int decode_ref_pic_list_reordering(H264Context *h){
2838 MpegEncContext * const s = &h->s;
2839 int list, index, pic_structure;
2841 print_short_term(h);
2842 print_long_term(h);
2844 for(list=0; list<h->list_count; list++){
2845 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2847 if(get_bits1(&s->gb)){
2848 int pred= h->curr_pic_num;
2850 for(index=0; ; index++){
2851 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2852 unsigned int pic_id;
2853 int i;
2854 Picture *ref = NULL;
2856 if(reordering_of_pic_nums_idc==3)
2857 break;
2859 if(index >= h->ref_count[list]){
2860 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2861 return -1;
2864 if(reordering_of_pic_nums_idc<3){
2865 if(reordering_of_pic_nums_idc<2){
2866 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2867 int frame_num;
2869 if(abs_diff_pic_num > h->max_pic_num){
2870 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2871 return -1;
2874 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2875 else pred+= abs_diff_pic_num;
2876 pred &= h->max_pic_num - 1;
2878 frame_num = pic_num_extract(h, pred, &pic_structure);
2880 for(i= h->short_ref_count-1; i>=0; i--){
2881 ref = h->short_ref[i];
2882 assert(ref->reference);
2883 assert(!ref->long_ref);
2885 ref->frame_num == frame_num &&
2886 (ref->reference & pic_structure)
2888 break;
2890 if(i>=0)
2891 ref->pic_id= pred;
2892 }else{
2893 int long_idx;
2894 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2896 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2898 if(long_idx>31){
2899 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2900 return -1;
2902 ref = h->long_ref[long_idx];
2903 assert(!(ref && !ref->reference));
2904 if(ref && (ref->reference & pic_structure)){
2905 ref->pic_id= pic_id;
2906 assert(ref->long_ref);
2907 i=0;
2908 }else{
2909 i=-1;
2913 if (i < 0) {
2914 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2915 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2916 } else {
2917 for(i=index; i+1<h->ref_count[list]; i++){
2918 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2919 break;
2921 for(; i > index; i--){
2922 h->ref_list[list][i]= h->ref_list[list][i-1];
2924 h->ref_list[list][index]= *ref;
2925 if (FIELD_PICTURE){
2926 pic_as_field(&h->ref_list[list][index], pic_structure);
2929 }else{
2930 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2931 return -1;
2936 for(list=0; list<h->list_count; list++){
2937 for(index= 0; index < h->ref_count[list]; index++){
2938 if(!h->ref_list[list][index].data[0]){
2939 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2940 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2945 return 0;
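/**
 * For MBAFF slices, append field variants of every frame reference:
 * ref_list[list][16+2*i] is the top and [16+2*i+1] the bottom field of frame
 * reference i, with the prediction weights duplicated to match.
 */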
2948 static void fill_mbaff_ref_list(H264Context *h){
2949 int list, i, j;
2950 for(list=0; list<2; list++){ //FIXME try list_count
2951 for(i=0; i<h->ref_count[list]; i++){
2952 Picture *frame = &h->ref_list[list][i];
2953 Picture *field = &h->ref_list[list][16+2*i];
2954 field[0] = *frame;
2955 for(j=0; j<3; j++)
2956 field[0].linesize[j] <<= 1;
2957 field[0].reference = PICT_TOP_FIELD;
2958 field[0].poc= field[0].field_poc[0];
2959 field[1] = field[0];
2960 for(j=0; j<3; j++)
2961 field[1].data[j] += frame->linesize[j];
2962 field[1].reference = PICT_BOTTOM_FIELD;
2963 field[1].poc= field[1].field_poc[1];
2965 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2966 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2967 for(j=0; j<2; j++){
2968 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2969 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2973 for(j=0; j<h->ref_count[1]; j++){
2974 for(i=0; i<h->ref_count[0]; i++)
2975 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2976 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2977 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
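/**
 * Parse pred_weight_table() from the slice header: explicit luma/chroma
 * weights and offsets per reference, setting use_weight/use_weight_chroma
 * when any of them differ from the defaults.
 */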
2981 static int pred_weight_table(H264Context *h){
2982 MpegEncContext * const s = &h->s;
2983 int list, i;
2984 int luma_def, chroma_def;
2986 h->use_weight= 0;
2987 h->use_weight_chroma= 0;
2988 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2989 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2990 luma_def = 1<<h->luma_log2_weight_denom;
2991 chroma_def = 1<<h->chroma_log2_weight_denom;
2993 for(list=0; list<2; list++){
2994 for(i=0; i<h->ref_count[list]; i++){
2995 int luma_weight_flag, chroma_weight_flag;
2997 luma_weight_flag= get_bits1(&s->gb);
2998 if(luma_weight_flag){
2999 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3000 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3001 if( h->luma_weight[list][i] != luma_def
3002 || h->luma_offset[list][i] != 0)
3003 h->use_weight= 1;
3004 }else{
3005 h->luma_weight[list][i]= luma_def;
3006 h->luma_offset[list][i]= 0;
3009 if(CHROMA){
3010 chroma_weight_flag= get_bits1(&s->gb);
3011 if(chroma_weight_flag){
3012 int j;
3013 for(j=0; j<2; j++){
3014 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3015 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3016 if( h->chroma_weight[list][i][j] != chroma_def
3017 || h->chroma_offset[list][i][j] != 0)
3018 h->use_weight_chroma= 1;
3020 }else{
3021 int j;
3022 for(j=0; j<2; j++){
3023 h->chroma_weight[list][i][j]= chroma_def;
3024 h->chroma_offset[list][i][j]= 0;
3029 if(h->slice_type_nos != FF_B_TYPE) break;
3031 h->use_weight= h->use_weight || h->use_weight_chroma;
3032 return 0;
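/*
 * Derive implicit bi-prediction weights from POC distances:
 * implicit_weight[ref0][ref1] stores the list0 weight w0 = 64 - dist_scale_factor
 * (the list1 weight is 64 - w0), falling back to equal 32/32 weights when the
 * POC distance is zero or the scale factor is out of range.
 */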
3035 static void implicit_weight_table(H264Context *h){
3036 MpegEncContext * const s = &h->s;
3037 int ref0, ref1;
3038 int cur_poc = s->current_picture_ptr->poc;
3040 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3041 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3042 h->use_weight= 0;
3043 h->use_weight_chroma= 0;
3044 return;
3047 h->use_weight= 2;
3048 h->use_weight_chroma= 2;
3049 h->luma_log2_weight_denom= 5;
3050 h->chroma_log2_weight_denom= 5;
3052 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3053 int poc0 = h->ref_list[0][ref0].poc;
3054 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3055 int poc1 = h->ref_list[1][ref1].poc;
3056 int td = av_clip(poc1 - poc0, -128, 127);
3057 if(td){
3058 int tb = av_clip(cur_poc - poc0, -128, 127);
3059 int tx = (16384 + (FFABS(td) >> 1)) / td;
3060 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3061 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3062 h->implicit_weight[ref0][ref1] = 32;
3063 else
3064 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3065 }else
3066 h->implicit_weight[ref0][ref1] = 32;
3072 * Mark a picture as no longer needed for reference. The refmask
3073 * argument allows unreferencing of individual fields or the whole frame.
3074 * If the picture becomes entirely unreferenced, but is being held for
3075 * display purposes, it is marked as such.
3076 * @param refmask mask of fields to unreference; the mask is bitwise
3077 * anded with the reference marking of pic
3078 * @return non-zero if pic becomes entirely unreferenced (except possibly
3079 * for display purposes), zero if one of the fields remains in
3080 * reference
3082 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3083 int i;
3084 if (pic->reference &= refmask) {
3085 return 0;
3086 } else {
3087 for(i = 0; h->delayed_pic[i]; i++)
3088 if(pic == h->delayed_pic[i]){
3089 pic->reference=DELAYED_PIC_REF;
3090 break;
3092 return 1;
3097 * instantaneous decoder refresh.
3099 static void idr(H264Context *h){
3100 int i;
3102 for(i=0; i<16; i++){
3103 remove_long(h, i, 0);
3105 assert(h->long_ref_count==0);
3107 for(i=0; i<h->short_ref_count; i++){
3108 unreference_pic(h, h->short_ref[i], 0);
3109 h->short_ref[i]= NULL;
3111 h->short_ref_count=0;
3112 h->prev_frame_num= 0;
3113 h->prev_frame_num_offset= 0;
3114 h->prev_poc_msb=
3115 h->prev_poc_lsb= 0;
3118 /* forget old pics after a seek */
3119 static void flush_dpb(AVCodecContext *avctx){
3120 H264Context *h= avctx->priv_data;
3121 int i;
3122 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3123 if(h->delayed_pic[i])
3124 h->delayed_pic[i]->reference= 0;
3125 h->delayed_pic[i]= NULL;
3127 h->outputed_poc= INT_MIN;
3128 idr(h);
3129 if(h->s.current_picture_ptr)
3130 h->s.current_picture_ptr->reference= 0;
3131 h->s.first_field= 0;
3132 ff_mpeg_flush(avctx);
3136 * Find a Picture in the short term reference list by frame number.
3137 * @param frame_num frame number to search for
3138 * @param idx the index into h->short_ref where the returned picture is found;
3139 * undefined if no picture is found.
3140 * @return pointer to the found picture, or NULL if no pic with the provided
3141 * frame number is found
3143 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3144 MpegEncContext * const s = &h->s;
3145 int i;
3147 for(i=0; i<h->short_ref_count; i++){
3148 Picture *pic= h->short_ref[i];
3149 if(s->avctx->debug&FF_DEBUG_MMCO)
3150 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3151 if(pic->frame_num == frame_num) {
3152 *idx = i;
3153 return pic;
3156 return NULL;
3160 * Remove a picture from the short term reference list by its index in
3161 * that list. This does no checking on the provided index; it is assumed
3162 * to be valid. Other list entries are shifted down.
3163 * @param i index into h->short_ref of picture to remove.
3165 static void remove_short_at_index(H264Context *h, int i){
3166 assert(i >= 0 && i < h->short_ref_count);
3167 h->short_ref[i]= NULL;
3168 if (--h->short_ref_count)
3169 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3174 * @return the removed picture or NULL if an error occurs
3176 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3177 MpegEncContext * const s = &h->s;
3178 Picture *pic;
3179 int i;
3181 if(s->avctx->debug&FF_DEBUG_MMCO)
3182 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3184 pic = find_short(h, frame_num, &i);
3185 if (pic){
3186 if(unreference_pic(h, pic, ref_mask))
3187 remove_short_at_index(h, i);
3190 return pic;
3194 * Remove a picture from the long term reference list by its index in
3195 * that list.
3196 * @return the removed picture or NULL if an error occurs
3198 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3199 Picture *pic;
3201 pic= h->long_ref[i];
3202 if (pic){
3203 if(unreference_pic(h, pic, ref_mask)){
3204 assert(h->long_ref[i]->long_ref == 1);
3205 h->long_ref[i]->long_ref= 0;
3206 h->long_ref[i]= NULL;
3207 h->long_ref_count--;
3211 return pic;
3215 * print short term list
3217 static void print_short_term(H264Context *h) {
3218 uint32_t i;
3219 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3220 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3221 for(i=0; i<h->short_ref_count; i++){
3222 Picture *pic= h->short_ref[i];
3223 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3229 * print long term list
3231 static void print_long_term(H264Context *h) {
3232 uint32_t i;
3233 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3234 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3235 for(i = 0; i < 16; i++){
3236 Picture *pic= h->long_ref[i];
3237 if (pic) {
3238 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3245 * Executes the reference picture marking (memory management control operations).
3247 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3248 MpegEncContext * const s = &h->s;
3249 int i, j;
3250 int current_ref_assigned=0;
3251 Picture *pic;
3253 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3254 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3256 for(i=0; i<mmco_count; i++){
3257 int structure, frame_num;
3258 if(s->avctx->debug&FF_DEBUG_MMCO)
3259 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3261 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3262 || mmco[i].opcode == MMCO_SHORT2LONG){
3263 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3264 pic = find_short(h, frame_num, &j);
3265 if(!pic){
3266 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3267 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3268 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3269 continue;
3273 switch(mmco[i].opcode){
3274 case MMCO_SHORT2UNUSED:
3275 if(s->avctx->debug&FF_DEBUG_MMCO)
3276 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3277 remove_short(h, frame_num, structure ^ PICT_FRAME);
3278 break;
3279 case MMCO_SHORT2LONG:
3280 if (h->long_ref[mmco[i].long_arg] != pic)
3281 remove_long(h, mmco[i].long_arg, 0);
3283 remove_short_at_index(h, j);
3284 h->long_ref[ mmco[i].long_arg ]= pic;
3285 if (h->long_ref[ mmco[i].long_arg ]){
3286 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3287 h->long_ref_count++;
3289 break;
3290 case MMCO_LONG2UNUSED:
3291 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3292 pic = h->long_ref[j];
3293 if (pic) {
3294 remove_long(h, j, structure ^ PICT_FRAME);
3295 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3296 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3297 break;
3298 case MMCO_LONG:
3299 // Comment below left from previous code as it is an interesting note.
3300 /* First field in pair is in short term list or
3301 * at a different long term index.
3302 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3303 * Report the problem and keep the pair where it is,
3304 * and mark this field valid. */
3307 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3308 remove_long(h, mmco[i].long_arg, 0);
3310 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3311 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312 h->long_ref_count++;
3315 s->current_picture_ptr->reference |= s->picture_structure;
3316 current_ref_assigned=1;
3317 break;
3318 case MMCO_SET_MAX_LONG:
3319 assert(mmco[i].long_arg <= 16);
3320 // just remove the long term refs whose index is greater than the new max
3321 for(j = mmco[i].long_arg; j<16; j++){
3322 remove_long(h, j, 0);
3324 break;
3325 case MMCO_RESET:
3326 while(h->short_ref_count){
3327 remove_short(h, h->short_ref[0]->frame_num, 0);
3329 for(j = 0; j < 16; j++) {
3330 remove_long(h, j, 0);
3332 s->current_picture_ptr->poc=
3333 s->current_picture_ptr->field_poc[0]=
3334 s->current_picture_ptr->field_poc[1]=
3335 h->poc_lsb=
3336 h->poc_msb=
3337 h->frame_num=
3338 s->current_picture_ptr->frame_num= 0;
3339 break;
3340 default: assert(0);
3344 if (!current_ref_assigned) {
3345 /* Second field of complementary field pair; the first field of
3346 * which is already referenced. If short referenced, it
3347 * should be first entry in short_ref. If not, it must exist
3348 * in long_ref; trying to put it on the short list here is an
3349 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3). */
3351 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3352 /* Just mark the second field valid */
3353 s->current_picture_ptr->reference = PICT_FRAME;
3354 } else if (s->current_picture_ptr->long_ref) {
3355 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3356 "assignment for second field "
3357 "in complementary field pair "
3358 "(first field is long term)\n");
3359 } else {
3360 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3361 if(pic){
3362 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3365 if(h->short_ref_count)
3366 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3368 h->short_ref[0]= s->current_picture_ptr;
3369 h->short_ref_count++;
3370 s->current_picture_ptr->reference |= s->picture_structure;
3374 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3376 /* We have too many reference frames, probably due to corrupted
3377 * stream. Need to discard one frame. Prevents overrun of the
3378 * short_ref and long_ref buffers. */
3380 av_log(h->s.avctx, AV_LOG_ERROR,
3381 "number of reference frames exceeds max (probably "
3382 "corrupt input), discarding one\n");
3384 if (h->long_ref_count && !h->short_ref_count) {
3385 for (i = 0; i < 16; ++i)
3386 if (h->long_ref[i])
3387 break;
3389 assert(i < 16);
3390 remove_long(h, i, 0);
3391 } else {
3392 pic = h->short_ref[h->short_ref_count - 1];
3393 remove_short(h, pic->frame_num, 0);
3397 print_short_term(h);
3398 print_long_term(h);
3399 return 0;
3402 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3403 MpegEncContext * const s = &h->s;
3404 int i;
3406 h->mmco_index= 0;
3407 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3408 s->broken_link= get_bits1(gb) -1;
3409 if(get_bits1(gb)){
3410 h->mmco[0].opcode= MMCO_LONG;
3411 h->mmco[0].long_arg= 0;
3412 h->mmco_index= 1;
3414 }else{
3415 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3416 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3417 MMCOOpcode opcode= get_ue_golomb(gb);
3419 h->mmco[i].opcode= opcode;
3420 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3421 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3422 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3423 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3424 return -1; } */
3427 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3428 unsigned int long_arg= get_ue_golomb(gb);
3429 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3430 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3431 return -1;
3433 h->mmco[i].long_arg= long_arg;
3436 if(opcode > (unsigned)MMCO_LONG){
3437 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3438 return -1;
3440 if(opcode == MMCO_END)
3441 break;
3443 h->mmco_index= i;
3444 }else{
3445 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3447 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3448 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3449 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3450 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3451 h->mmco_index= 1;
3452 if (FIELD_PICTURE) {
3453 h->mmco[0].short_pic_num *= 2;
3454 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3455 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3456 h->mmco_index= 2;
3462 return 0;
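/* init_poc() below derives the picture order count for the current picture:
 * poc_type 0 reconstructs it from pic_order_cnt_lsb with MSB wrap-around detection,
 * poc_type 1 derives it from frame_num using the offset table coded in the SPS,
 * and poc_type 2 uses 2*(frame_num_offset + frame_num), minus one for
 * non-reference pictures. */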
3465 static int init_poc(H264Context *h){
3466 MpegEncContext * const s = &h->s;
3467 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3468 int field_poc[2];
3469 Picture *cur = s->current_picture_ptr;
3471 h->frame_num_offset= h->prev_frame_num_offset;
3472 if(h->frame_num < h->prev_frame_num)
3473 h->frame_num_offset += max_frame_num;
3475 if(h->sps.poc_type==0){
3476 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3478 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3479 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3480 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3481 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3482 else
3483 h->poc_msb = h->prev_poc_msb;
3484 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3485 field_poc[0] =
3486 field_poc[1] = h->poc_msb + h->poc_lsb;
3487 if(s->picture_structure == PICT_FRAME)
3488 field_poc[1] += h->delta_poc_bottom;
3489 }else if(h->sps.poc_type==1){
3490 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3491 int i;
3493 if(h->sps.poc_cycle_length != 0)
3494 abs_frame_num = h->frame_num_offset + h->frame_num;
3495 else
3496 abs_frame_num = 0;
3498 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3499 abs_frame_num--;
3501 expected_delta_per_poc_cycle = 0;
3502 for(i=0; i < h->sps.poc_cycle_length; i++)
3503 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3505 if(abs_frame_num > 0){
3506 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3507 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3509 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3510 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3511 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3512 } else
3513 expectedpoc = 0;
3515 if(h->nal_ref_idc == 0)
3516 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3518 field_poc[0] = expectedpoc + h->delta_poc[0];
3519 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3521 if(s->picture_structure == PICT_FRAME)
3522 field_poc[1] += h->delta_poc[1];
3523 }else{
3524 int poc= 2*(h->frame_num_offset + h->frame_num);
3526 if(!h->nal_ref_idc)
3527 poc--;
3529 field_poc[0]= poc;
3530 field_poc[1]= poc;
3533 if(s->picture_structure != PICT_BOTTOM_FIELD)
3534 s->current_picture_ptr->field_poc[0]= field_poc[0];
3535 if(s->picture_structure != PICT_TOP_FIELD)
3536 s->current_picture_ptr->field_poc[1]= field_poc[1];
3537 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3539 return 0;
3544 /** initialize scan tables */
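/* Note: when a non-C IDCT is in use, the T() permutation below transposes each
 * scan position (row/column swap), presumably so that coefficients end up in the
 * layout the optimized IDCT implementations expect. */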
3546 static void init_scan_tables(H264Context *h){
3547 MpegEncContext * const s = &h->s;
3548 int i;
3549 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3550 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3551 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3552 }else{
3553 for(i=0; i<16; i++){
3554 #define T(x) (x>>2) | ((x<<2) & 0xF)
3555 h->zigzag_scan[i] = T(zigzag_scan[i]);
3556 h-> field_scan[i] = T( field_scan[i]);
3557 #undef T
3560 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3561 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3562 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3563 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3564 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3565 }else{
3566 for(i=0; i<64; i++){
3567 #define T(x) (x>>3) | ((x&7)<<3)
3568 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3569 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3570 h->field_scan8x8[i] = T(field_scan8x8[i]);
3571 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3572 #undef T
3575 if(h->sps.transform_bypass){ //FIXME same ugly
3576 h->zigzag_scan_q0 = zigzag_scan;
3577 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3578 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3579 h->field_scan_q0 = field_scan;
3580 h->field_scan8x8_q0 = field_scan8x8;
3581 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3582 }else{
3583 h->zigzag_scan_q0 = h->zigzag_scan;
3584 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3585 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3586 h->field_scan_q0 = h->field_scan;
3587 h->field_scan8x8_q0 = h->field_scan8x8;
3588 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3593 /** Replicates H264 "master" context to thread contexts. */
3595 static void clone_slice(H264Context *dst, H264Context *src)
3597 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3598 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3599 dst->s.current_picture = src->s.current_picture;
3600 dst->s.linesize = src->s.linesize;
3601 dst->s.uvlinesize = src->s.uvlinesize;
3602 dst->s.first_field = src->s.first_field;
3604 dst->prev_poc_msb = src->prev_poc_msb;
3605 dst->prev_poc_lsb = src->prev_poc_lsb;
3606 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3607 dst->prev_frame_num = src->prev_frame_num;
3608 dst->short_ref_count = src->short_ref_count;
3610 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3611 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3612 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3613 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3615 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3616 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3620 /** decodes a slice header.
3621 * This will also call MPV_common_init() and frame_start() as needed.
3623 * @param h h264context
3624 * @param h0 h264 master context (differs from 'h' when doing slice-based parallel decoding)
3626 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded */
3628 static int decode_slice_header(H264Context *h, H264Context *h0){
3629 MpegEncContext * const s = &h->s;
3630 MpegEncContext * const s0 = &h0->s;
3631 unsigned int first_mb_in_slice;
3632 unsigned int pps_id;
3633 int num_ref_idx_active_override_flag;
3634 unsigned int slice_type, tmp, i, j;
3635 int default_ref_list_done = 0;
3636 int last_pic_structure;
3638 s->dropable= h->nal_ref_idc == 0;
3640 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3641 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3642 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3643 }else{
3644 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3645 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3648 first_mb_in_slice= get_ue_golomb(&s->gb);
3650 if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3651 h0->current_slice = 0;
3652 if (!s0->first_field)
3653 s->current_picture_ptr= NULL;
3656 slice_type= get_ue_golomb(&s->gb);
3657 if(slice_type > 9){
3658 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3659 return -1;
3661 if(slice_type > 4){
3662 slice_type -= 5;
3663 h->slice_type_fixed=1;
3664 }else
3665 h->slice_type_fixed=0;
3667 slice_type= golomb_to_pict_type[ slice_type ];
3668 if (slice_type == FF_I_TYPE
3669 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3670 default_ref_list_done = 1;
3672 h->slice_type= slice_type;
3673 h->slice_type_nos= slice_type & 3;
3675 s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3676 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3677 av_log(h->s.avctx, AV_LOG_ERROR,
3678 "B picture before any references, skipping\n");
3679 return -1;
3682 pps_id= get_ue_golomb(&s->gb);
3683 if(pps_id>=MAX_PPS_COUNT){
3684 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3685 return -1;
3687 if(!h0->pps_buffers[pps_id]) {
3688 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3689 return -1;
3691 h->pps= *h0->pps_buffers[pps_id];
3693 if(!h0->sps_buffers[h->pps.sps_id]) {
3694 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3695 return -1;
3697 h->sps = *h0->sps_buffers[h->pps.sps_id];
3699 if(h == h0 && h->dequant_coeff_pps != pps_id){
3700 h->dequant_coeff_pps = pps_id;
3701 init_dequant_tables(h);
3704 s->mb_width= h->sps.mb_width;
3705 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3707 h->b_stride= s->mb_width*4;
3708 h->b8_stride= s->mb_width*2;
3710 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3711 if(h->sps.frame_mbs_only_flag)
3712 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3713 else
3714 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
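/* The SPS cropping offsets are in units of two luma samples horizontally, and twice
 * that vertically when the stream is not frame_mbs_only (field coding), hence the
 * 2* and 4* factors above; the FFMIN clamps look like a safety limit of this decoder
 * rather than something required by the spec. */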
3716 if (s->context_initialized
3717 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3718 if(h != h0)
3719 return -1; // width / height changed during parallelized decoding
3720 free_tables(h);
3721 MPV_common_end(s);
3723 if (!s->context_initialized) {
3724 if(h != h0)
3725 return -1; // we can't (re-)initialize context during parallel decoding
3726 if (MPV_common_init(s) < 0)
3727 return -1;
3728 s->first_field = 0;
3730 init_scan_tables(h);
3731 alloc_tables(h);
3733 for(i = 1; i < s->avctx->thread_count; i++) {
3734 H264Context *c;
3735 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3736 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3737 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3738 c->sps = h->sps;
3739 c->pps = h->pps;
3740 init_scan_tables(c);
3741 clone_tables(c, h);
3744 for(i = 0; i < s->avctx->thread_count; i++)
3745 if(context_init(h->thread_context[i]) < 0)
3746 return -1;
3748 s->avctx->width = s->width;
3749 s->avctx->height = s->height;
3750 s->avctx->sample_aspect_ratio= h->sps.sar;
3751 if(!s->avctx->sample_aspect_ratio.den)
3752 s->avctx->sample_aspect_ratio.den = 1;
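/* VUI timing: num_units_in_tick/time_scale is nominally a field period, so the frame
 * duration used as time_base is 2*num_units_in_tick/time_scale. The x264_build check
 * below appears to compensate for old x264 versions that wrote these values without
 * the factor of two. */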
3754 if(h->sps.timing_info_present_flag){
3755 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3756 if(h->x264_build > 0 && h->x264_build < 44)
3757 s->avctx->time_base.den *= 2;
3758 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3759 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3763 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3765 h->mb_mbaff = 0;
3766 h->mb_aff_frame = 0;
3767 last_pic_structure = s0->picture_structure;
3768 if(h->sps.frame_mbs_only_flag){
3769 s->picture_structure= PICT_FRAME;
3770 }else{
3771 if(get_bits1(&s->gb)) { //field_pic_flag
3772 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3773 } else {
3774 s->picture_structure= PICT_FRAME;
3775 h->mb_aff_frame = h->sps.mb_aff;
3778 h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3780 if(h0->current_slice == 0){
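/* If frame_num jumped by more than one, synthesize the missing ("non-existing")
 * frames so that the short term reference list and POC state stay consistent:
 * each gap frame goes through frame_start() and reference marking with no MMCOs
 * (i.e. the sliding window). */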
3781 while(h->frame_num != h->prev_frame_num &&
3782 h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3783 av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3784 frame_start(h);
3785 h->prev_frame_num++;
3786 h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3787 s->current_picture_ptr->frame_num= h->prev_frame_num;
3788 execute_ref_pic_marking(h, NULL, 0);
3791 /* See if we have a decoded first field looking for a pair... */
3792 if (s0->first_field) {
3793 assert(s0->current_picture_ptr);
3794 assert(s0->current_picture_ptr->data[0]);
3795 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3797 /* figure out if we have a complementary field pair */
3798 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3800 /* Previous field is unmatched. Don't display it, but let it
3801 * remain for reference if marked as such. */
3803 s0->current_picture_ptr = NULL;
3804 s0->first_field = FIELD_PICTURE;
3806 } else {
3807 if (h->nal_ref_idc &&
3808 s0->current_picture_ptr->reference &&
3809 s0->current_picture_ptr->frame_num != h->frame_num) {
3811 /* This and previous field were reference, but had
3812 * different frame_nums. Consider this field first in
3813 * pair. Throw away previous field except for reference
3814 * purposes. */
3816 s0->first_field = 1;
3817 s0->current_picture_ptr = NULL;
3819 } else {
3820 /* Second field in complementary pair */
3821 s0->first_field = 0;
3825 } else {
3826 /* Frame or first field in a potentially complementary pair */
3827 assert(!s0->current_picture_ptr);
3828 s0->first_field = FIELD_PICTURE;
3831 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3832 s0->first_field = 0;
3833 return -1;
3836 if(h != h0)
3837 clone_slice(h, h0);
3839 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3841 assert(s->mb_num == s->mb_width * s->mb_height);
3842 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3843 first_mb_in_slice >= s->mb_num){
3844 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3845 return -1;
3847 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3848 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3849 if (s->picture_structure == PICT_BOTTOM_FIELD)
3850 s->resync_mb_y = s->mb_y = s->mb_y + 1;
3851 assert(s->mb_y < s->mb_height);
3853 if(s->picture_structure==PICT_FRAME){
3854 h->curr_pic_num= h->frame_num;
3855 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3856 }else{
3857 h->curr_pic_num= 2*h->frame_num + 1;
3858 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3861 if(h->nal_unit_type == NAL_IDR_SLICE){
3862 get_ue_golomb(&s->gb); /* idr_pic_id */
3865 if(h->sps.poc_type==0){
3866 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3868 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3869 h->delta_poc_bottom= get_se_golomb(&s->gb);
3873 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3874 h->delta_poc[0]= get_se_golomb(&s->gb);
3876 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3877 h->delta_poc[1]= get_se_golomb(&s->gb);
3880 init_poc(h);
3882 if(h->pps.redundant_pic_cnt_present){
3883 h->redundant_pic_count= get_ue_golomb(&s->gb);
3886 //set defaults, might be overridden a few lines later
3887 h->ref_count[0]= h->pps.ref_count[0];
3888 h->ref_count[1]= h->pps.ref_count[1];
3890 if(h->slice_type_nos != FF_I_TYPE){
3891 if(h->slice_type_nos == FF_B_TYPE){
3892 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3894 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3896 if(num_ref_idx_active_override_flag){
3897 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3898 if(h->slice_type_nos==FF_B_TYPE)
3899 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3901 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3902 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3903 h->ref_count[0]= h->ref_count[1]= 1;
3904 return -1;
3907 if(h->slice_type_nos == FF_B_TYPE)
3908 h->list_count= 2;
3909 else
3910 h->list_count= 1;
3911 }else
3912 h->list_count= 0;
3914 if(!default_ref_list_done){
3915 fill_default_ref_list(h);
3918 if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3919 return -1;
3921 if(h->slice_type_nos!=FF_I_TYPE){
3922 s->last_picture_ptr= &h->ref_list[0][0];
3923 ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3925 if(h->slice_type_nos==FF_B_TYPE){
3926 s->next_picture_ptr= &h->ref_list[1][0];
3927 ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3930 if( (h->pps.weighted_pred && h->slice_type_nos == FF_P_TYPE )
3931 || (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3932 pred_weight_table(h);
3933 else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3934 implicit_weight_table(h);
3935 else
3936 h->use_weight = 0;
3938 if(h->nal_ref_idc)
3939 decode_ref_pic_marking(h0, &s->gb);
3941 if(FRAME_MBAFF)
3942 fill_mbaff_ref_list(h);
3944 if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3945 direct_dist_scale_factor(h);
3946 direct_ref_list_init(h);
3948 if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3949 tmp = get_ue_golomb(&s->gb);
3950 if(tmp > 2){
3951 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3952 return -1;
3954 h->cabac_init_idc= tmp;
3957 h->last_qscale_diff = 0;
3958 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3959 if(tmp>51){
3960 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3961 return -1;
3963 s->qscale= tmp;
3964 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3965 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3966 //FIXME qscale / qp ... stuff
3967 if(h->slice_type == FF_SP_TYPE){
3968 get_bits1(&s->gb); /* sp_for_switch_flag */
3970 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3971 get_se_golomb(&s->gb); /* slice_qs_delta */
3974 h->deblocking_filter = 1;
3975 h->slice_alpha_c0_offset = 0;
3976 h->slice_beta_offset = 0;
3977 if( h->pps.deblocking_filter_parameters_present ) {
3978 tmp= get_ue_golomb(&s->gb);
3979 if(tmp > 2){
3980 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3981 return -1;
3983 h->deblocking_filter= tmp;
3984 if(h->deblocking_filter < 2)
3985 h->deblocking_filter^= 1; // 1<->0
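/* After the swap above: 0 = deblocking disabled, 1 = enabled, 2 = enabled but not
 * across slice boundaries (the bitstream's disable_deblocking_filter_idc uses the
 * opposite meaning for values 0 and 1). */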
3987 if( h->deblocking_filter ) {
3988 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3989 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3993 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
3994 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3995 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type_nos == FF_B_TYPE)
3996 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3997 h->deblocking_filter= 0;
3999 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4000 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4001 /* Cheat slightly for speed:
4002 Do not bother to deblock across slices. */
4003 h->deblocking_filter = 2;
4004 } else {
4005 h0->max_contexts = 1;
4006 if(!h0->single_decode_warning) {
4007 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4008 h0->single_decode_warning = 1;
4010 if(h != h0)
4011 return 1; // deblocking switched inside frame
4015 #if 0 //FMO
4016 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4017 slice_group_change_cycle= get_bits(&s->gb, ?);
4018 #endif
4020 h0->last_slice_type = slice_type;
4021 h->slice_num = ++h0->current_slice;
4022 if(h->slice_num >= MAX_SLICES){
4023 av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4026 for(j=0; j<2; j++){
4027 int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4028 ref2frm[0]=
4029 ref2frm[1]= -1;
4030 for(i=0; i<16; i++)
4031 ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4032 +(h->ref_list[j][i].reference&3);
4033 ref2frm[18+0]=
4034 ref2frm[18+1]= -1;
4035 for(i=16; i<48; i++)
4036 ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4037 +(h->ref_list[j][i].reference&3);
4040 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4041 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4043 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4044 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4045 h->slice_num,
4046 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4047 first_mb_in_slice,
4048 av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4049 pps_id, h->frame_num,
4050 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4051 h->ref_count[0], h->ref_count[1],
4052 s->qscale,
4053 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4054 h->use_weight,
4055 h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4056 h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4060 return 0;
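/* get_level_prefix() below reads the unary level_prefix of CAVLC level coding:
 * it returns the number of leading zero bits before the first 1 and consumes
 * prefix+1 bits from the bitstream. */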
4066 static inline int get_level_prefix(GetBitContext *gb){
4067 unsigned int buf;
4068 int log;
4070 OPEN_READER(re, gb);
4071 UPDATE_CACHE(re, gb);
4072 buf=GET_CACHE(re, gb);
4074 log= 32 - av_log2(buf);
4075 #ifdef TRACE
4076 print_bin(buf>>(32-log), log);
4077 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4078 #endif
4080 LAST_SKIP_BITS(re, gb, log);
4081 CLOSE_READER(re, gb);
4083 return log-1;
4086 static inline int get_dct8x8_allowed(H264Context *h){
4087 int i;
4088 for(i=0; i<4; i++){
4089 if(!IS_SUB_8X8(h->sub_mb_type[i])
4090 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4091 return 0;
4093 return 1;
4097 /** decodes a residual block.
4098 * @param n block index
4099 * @param scantable scan table mapping coefficient index to block position
4100 * @param max_coeff number of coefficients in the block
4101 * @return <0 if an error occurred */
4103 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4104 MpegEncContext * const s = &h->s;
4105 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4106 int level[16];
4107 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4109 //FIXME put trailing_ones into the context
4111 if(n == CHROMA_DC_BLOCK_INDEX){
4112 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4113 total_coeff= coeff_token>>2;
4114 }else{
4115 if(n == LUMA_DC_BLOCK_INDEX){
4116 total_coeff= pred_non_zero_count(h, 0);
4117 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4118 total_coeff= coeff_token>>2;
4119 }else{
4120 total_coeff= pred_non_zero_count(h, n);
4121 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4122 total_coeff= coeff_token>>2;
4123 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4127 //FIXME set last_non_zero?
4129 if(total_coeff==0)
4130 return 0;
4131 if(total_coeff > (unsigned)max_coeff) {
4132 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4133 return -1;
4136 trailing_ones= coeff_token&3;
4137 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4138 assert(total_coeff<=16);
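/* CAVLC residual layout: after coeff_token (total_coeff/trailing_ones) come the signs
 * of the trailing ones, then the remaining levels coded as a unary prefix plus a
 * suffix whose length adapts to the magnitudes seen so far, then total_zeros and a
 * run_before value per coefficient. */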
4140 for(i=0; i<trailing_ones; i++){
4141 level[i]= 1 - 2*get_bits1(gb);
4144 if(i<total_coeff) {
4145 int level_code, mask;
4146 int suffix_length = total_coeff > 10 && trailing_ones < 3;
4147 int prefix= get_level_prefix(gb);
4149 //first coefficient has suffix_length equal to 0 or 1
4150 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4151 if(suffix_length)
4152 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4153 else
4154 level_code= (prefix<<suffix_length); //part
4155 }else if(prefix==14){
4156 if(suffix_length)
4157 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4158 else
4159 level_code= prefix + get_bits(gb, 4); //part
4160 }else{
4161 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4162 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much) sense
4163 if(prefix>=16)
4164 level_code += (1<<(prefix-3))-4096;
4167 if(trailing_ones < 3) level_code += 2;
4169 suffix_length = 1;
4170 if(level_code > 5)
4171 suffix_length++;
4172 mask= -(level_code&1);
4173 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4174 i++;
4176 //remaining coefficients have suffix_length > 0
4177 for(;i<total_coeff;i++) {
4178 static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4179 prefix = get_level_prefix(gb);
4180 if(prefix<15){
4181 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4182 }else{
4183 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4184 if(prefix>=16)
4185 level_code += (1<<(prefix-3))-4096;
4187 mask= -(level_code&1);
4188 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4189 if(level_code > suffix_limit[suffix_length])
4190 suffix_length++;
4194 if(total_coeff == max_coeff)
4195 zeros_left=0;
4196 else{
4197 if(n == CHROMA_DC_BLOCK_INDEX)
4198 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4199 else
4200 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4203 coeff_num = zeros_left + total_coeff - 1;
4204 j = scantable[coeff_num];
4205 if(n > 24){
4206 block[j] = level[0];
4207 for(i=1;i<total_coeff;i++) {
4208 if(zeros_left <= 0)
4209 run_before = 0;
4210 else if(zeros_left < 7){
4211 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4212 }else{
4213 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4215 zeros_left -= run_before;
4216 coeff_num -= 1 + run_before;
4217 j= scantable[ coeff_num ];
4219 block[j]= level[i];
4221 }else{
4222 block[j] = (level[0] * qmul[j] + 32)>>6;
4223 for(i=1;i<total_coeff;i++) {
4224 if(zeros_left <= 0)
4225 run_before = 0;
4226 else if(zeros_left < 7){
4227 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4228 }else{
4229 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4231 zeros_left -= run_before;
4232 coeff_num -= 1 + run_before;
4233 j= scantable[ coeff_num ];
4235 block[j]= (level[i] * qmul[j] + 32)>>6;
4239 if(zeros_left<0){
4240 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4241 return -1;
4244 return 0;
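/* For skipped macroblock pairs in MBAFF frames no mb_field_decoding_flag is coded;
 * predict_field_decoding_flag() below infers it from the left neighbour if that is in
 * the same slice, otherwise from the top neighbour, otherwise frame decoding is
 * assumed. */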
4247 static void predict_field_decoding_flag(H264Context *h){
4248 MpegEncContext * const s = &h->s;
4249 const int mb_xy= h->mb_xy;
4250 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4251 ? s->current_picture.mb_type[mb_xy-1]
4252 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4253 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4254 : 0;
4255 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4259 /** decodes a P_SKIP or B_SKIP macroblock */
4261 static void decode_mb_skip(H264Context *h){
4262 MpegEncContext * const s = &h->s;
4263 const int mb_xy= h->mb_xy;
4264 int mb_type=0;
4266 memset(h->non_zero_count[mb_xy], 0, 16);
4267 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4269 if(MB_FIELD)
4270 mb_type|= MB_TYPE_INTERLACED;
4272 if( h->slice_type_nos == FF_B_TYPE )
4274 // just for fill_caches. pred_direct_motion will set the real mb_type
4275 mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4277 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4278 pred_direct_motion(h, &mb_type);
4279 mb_type|= MB_TYPE_SKIP;
4281 else
4283 int mx, my;
4284 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4286 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4287 pred_pskip_motion(h, &mx, &my);
4288 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4289 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4292 write_back_motion(h, mb_type);
4293 s->current_picture.mb_type[mb_xy]= mb_type;
4294 s->current_picture.qscale_table[mb_xy]= s->qscale;
4295 h->slice_table[ mb_xy ]= h->slice_num;
4296 h->prev_mb_skipped= 1;
4300 /** decodes a macroblock
4301 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed */
4303 static int decode_mb_cavlc(H264Context *h){
4304 MpegEncContext * const s = &h->s;
4305 int mb_xy;
4306 int partition_count;
4307 unsigned int mb_type, cbp;
4308 int dct8x8_allowed= h->pps.transform_8x8_mode;
4310 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4312 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4314 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4315 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4316 down the code */
4317 if(h->slice_type_nos != FF_I_TYPE){
4318 if(s->mb_skip_run==-1)
4319 s->mb_skip_run= get_ue_golomb(&s->gb);
4321 if (s->mb_skip_run--) {
4322 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4323 if(s->mb_skip_run==0)
4324 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4325 else
4326 predict_field_decoding_flag(h);
4328 decode_mb_skip(h);
4329 return 0;
4332 if(FRAME_MBAFF){
4333 if( (s->mb_y&1) == 0 )
4334 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4337 h->prev_mb_skipped= 0;
4339 mb_type= get_ue_golomb(&s->gb);
4340 if(h->slice_type_nos == FF_B_TYPE){
4341 if(mb_type < 23){
4342 partition_count= b_mb_type_info[mb_type].partition_count;
4343 mb_type= b_mb_type_info[mb_type].type;
4344 }else{
4345 mb_type -= 23;
4346 goto decode_intra_mb;
4348 }else if(h->slice_type_nos == FF_P_TYPE){
4349 if(mb_type < 5){
4350 partition_count= p_mb_type_info[mb_type].partition_count;
4351 mb_type= p_mb_type_info[mb_type].type;
4352 }else{
4353 mb_type -= 5;
4354 goto decode_intra_mb;
4356 }else{
4357 assert(h->slice_type_nos == FF_I_TYPE);
4358 if(h->slice_type == FF_SI_TYPE && mb_type)
4359 mb_type--;
4360 decode_intra_mb:
4361 if(mb_type > 25){
4362 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4363 return -1;
4365 partition_count=0;
4366 cbp= i_mb_type_info[mb_type].cbp;
4367 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4368 mb_type= i_mb_type_info[mb_type].type;
4371 if(MB_FIELD)
4372 mb_type |= MB_TYPE_INTERLACED;
4374 h->slice_table[ mb_xy ]= h->slice_num;
4376 if(IS_INTRA_PCM(mb_type)){
4377 unsigned int x;
4379 // We assume these blocks are very rare so we do not optimize them.
4380 align_get_bits(&s->gb);
4382 // The pixels are stored in the same order as levels in h->mb array.
4383 for(x=0; x < (CHROMA ? 384 : 256); x++){
4384 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4387 // In deblocking, the quantizer is 0
4388 s->current_picture.qscale_table[mb_xy]= 0;
4389 // All coeffs are present
4390 memset(h->non_zero_count[mb_xy], 16, 16);
4392 s->current_picture.mb_type[mb_xy]= mb_type;
4393 return 0;
4396 if(MB_MBAFF){
4397 h->ref_count[0] <<= 1;
4398 h->ref_count[1] <<= 1;
4401 fill_caches(h, mb_type, 0);
4403 //mb_pred
4404 if(IS_INTRA(mb_type)){
4405 int pred_mode;
4406 // init_top_left_availability(h);
4407 if(IS_INTRA4x4(mb_type)){
4408 int i;
4409 int di = 1;
4410 if(dct8x8_allowed && get_bits1(&s->gb)){
4411 mb_type |= MB_TYPE_8x8DCT;
4412 di = 4;
4415 // fill_intra4x4_pred_table(h);
4416 for(i=0; i<16; i+=di){
4417 int mode= pred_intra_mode(h, i);
4419 if(!get_bits1(&s->gb)){
4420 const int rem_mode= get_bits(&s->gb, 3);
4421 mode = rem_mode + (rem_mode >= mode);
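// The 3-bit rem_intra4x4_pred_mode skips the predicted mode, so its 8 code values
// cover the 8 remaining intra prediction modes.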
4424 if(di==4)
4425 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4426 else
4427 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4429 write_back_intra_pred_mode(h);
4430 if( check_intra4x4_pred_mode(h) < 0)
4431 return -1;
4432 }else{
4433 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4434 if(h->intra16x16_pred_mode < 0)
4435 return -1;
4437 if(CHROMA){
4438 pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4439 if(pred_mode < 0)
4440 return -1;
4441 h->chroma_pred_mode= pred_mode;
4443 }else if(partition_count==4){
4444 int i, j, sub_partition_count[4], list, ref[2][4];
4446 if(h->slice_type_nos == FF_B_TYPE){
4447 for(i=0; i<4; i++){
4448 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4449 if(h->sub_mb_type[i] >=13){
4450 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4451 return -1;
4453 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4454 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4456 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4457 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4458 pred_direct_motion(h, &mb_type);
4459 h->ref_cache[0][scan8[4]] =
4460 h->ref_cache[1][scan8[4]] =
4461 h->ref_cache[0][scan8[12]] =
4462 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4464 }else{
4465 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4466 for(i=0; i<4; i++){
4467 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4468 if(h->sub_mb_type[i] >=4){
4469 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4470 return -1;
4472 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4473 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4477 for(list=0; list<h->list_count; list++){
4478 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4479 for(i=0; i<4; i++){
4480 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4481 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4482 unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4483 if(tmp>=ref_count){
4484 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4485 return -1;
4487 ref[list][i]= tmp;
4488 }else{
4489 //FIXME
4490 ref[list][i] = -1;
4495 if(dct8x8_allowed)
4496 dct8x8_allowed = get_dct8x8_allowed(h);
4498 for(list=0; list<h->list_count; list++){
4499 for(i=0; i<4; i++){
4500 if(IS_DIRECT(h->sub_mb_type[i])) {
4501 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4502 continue;
4504 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4505 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4507 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4508 const int sub_mb_type= h->sub_mb_type[i];
4509 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4510 for(j=0; j<sub_partition_count[i]; j++){
4511 int mx, my;
4512 const int index= 4*i + block_width*j;
4513 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4514 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4515 mx += get_se_golomb(&s->gb);
4516 my += get_se_golomb(&s->gb);
4517 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4519 if(IS_SUB_8X8(sub_mb_type)){
4520 mv_cache[ 1 ][0]=
4521 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4522 mv_cache[ 1 ][1]=
4523 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4524 }else if(IS_SUB_8X4(sub_mb_type)){
4525 mv_cache[ 1 ][0]= mx;
4526 mv_cache[ 1 ][1]= my;
4527 }else if(IS_SUB_4X8(sub_mb_type)){
4528 mv_cache[ 8 ][0]= mx;
4529 mv_cache[ 8 ][1]= my;
4531 mv_cache[ 0 ][0]= mx;
4532 mv_cache[ 0 ][1]= my;
4534 }else{
4535 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4536 p[0] = p[1]=
4537 p[8] = p[9]= 0;
4541 }else if(IS_DIRECT(mb_type)){
4542 pred_direct_motion(h, &mb_type);
4543 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4544 }else{
4545 int list, mx, my, i;
4546 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4547 if(IS_16X16(mb_type)){
4548 for(list=0; list<h->list_count; list++){
4549 unsigned int val;
4550 if(IS_DIR(mb_type, 0, list)){
4551 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4552 if(val >= h->ref_count[list]){
4553 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4554 return -1;
4556 }else
4557 val= LIST_NOT_USED&0xFF;
4558 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4560 for(list=0; list<h->list_count; list++){
4561 unsigned int val;
4562 if(IS_DIR(mb_type, 0, list)){
4563 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4564 mx += get_se_golomb(&s->gb);
4565 my += get_se_golomb(&s->gb);
4566 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4568 val= pack16to32(mx,my);
4569 }else
4570 val=0;
4571 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4574 else if(IS_16X8(mb_type)){
4575 for(list=0; list<h->list_count; list++){
4576 for(i=0; i<2; i++){
4577 unsigned int val;
4578 if(IS_DIR(mb_type, i, list)){
4579 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4580 if(val >= h->ref_count[list]){
4581 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4582 return -1;
4584 }else
4585 val= LIST_NOT_USED&0xFF;
4586 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4589 for(list=0; list<h->list_count; list++){
4590 for(i=0; i<2; i++){
4591 unsigned int val;
4592 if(IS_DIR(mb_type, i, list)){
4593 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4594 mx += get_se_golomb(&s->gb);
4595 my += get_se_golomb(&s->gb);
4596 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4598 val= pack16to32(mx,my);
4599 }else
4600 val=0;
4601 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4604 }else{
4605 assert(IS_8X16(mb_type));
4606 for(list=0; list<h->list_count; list++){
4607 for(i=0; i<2; i++){
4608 unsigned int val;
4609 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4610 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4611 if(val >= h->ref_count[list]){
4612 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4613 return -1;
4615 }else
4616 val= LIST_NOT_USED&0xFF;
4617 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4620 for(list=0; list<h->list_count; list++){
4621 for(i=0; i<2; i++){
4622 unsigned int val;
4623 if(IS_DIR(mb_type, i, list)){
4624 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4625 mx += get_se_golomb(&s->gb);
4626 my += get_se_golomb(&s->gb);
4627 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4629 val= pack16to32(mx,my);
4630 }else
4631 val=0;
4632 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4638 if(IS_INTER(mb_type))
4639 write_back_motion(h, mb_type);
4641 if(!IS_INTRA16x16(mb_type)){
4642 cbp= get_ue_golomb(&s->gb);
4643 if(cbp > 47){
4644 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4645 return -1;
4648 if(CHROMA){
4649 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4650 else cbp= golomb_to_inter_cbp [cbp];
4651 }else{
4652 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4653 else cbp= golomb_to_inter_cbp_gray[cbp];
4656 h->cbp = cbp;
4658 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4659 if(get_bits1(&s->gb)){
4660 mb_type |= MB_TYPE_8x8DCT;
4661 h->cbp_table[mb_xy]= cbp;
4664 s->current_picture.mb_type[mb_xy]= mb_type;
4666 if(cbp || IS_INTRA16x16(mb_type)){
4667 int i8x8, i4x4, chroma_idx;
4668 int dquant;
4669 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4670 const uint8_t *scan, *scan8x8, *dc_scan;
4672 // fill_non_zero_count_cache(h);
4674 if(IS_INTERLACED(mb_type)){
4675 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4676 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4677 dc_scan= luma_dc_field_scan;
4678 }else{
4679 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4680 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4681 dc_scan= luma_dc_zigzag_scan;
4684 dquant= get_se_golomb(&s->gb);
4686 if( dquant > 25 || dquant < -26 ){
4687 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4688 return -1;
4691 s->qscale += dquant;
4692 if(((unsigned)s->qscale) > 51){
4693 if(s->qscale<0) s->qscale+= 52;
4694 else s->qscale-= 52;
4697 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4698 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4699 if(IS_INTRA16x16(mb_type)){
4700 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4701 return -1; //FIXME continue if partitioned and other return -1 too
4704 assert((cbp&15) == 0 || (cbp&15) == 15);
4706 if(cbp&15){
4707 for(i8x8=0; i8x8<4; i8x8++){
4708 for(i4x4=0; i4x4<4; i4x4++){
4709 const int index= i4x4 + 4*i8x8;
4710 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4711 return -1;
4715 }else{
4716 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4718 }else{
4719 for(i8x8=0; i8x8<4; i8x8++){
4720 if(cbp & (1<<i8x8)){
4721 if(IS_8x8DCT(mb_type)){
4722 DCTELEM *buf = &h->mb[64*i8x8];
4723 uint8_t *nnz;
4724 for(i4x4=0; i4x4<4; i4x4++){
4725 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4726 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4727 return -1;
4729 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4730 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4731 }else{
4732 for(i4x4=0; i4x4<4; i4x4++){
4733 const int index= i4x4 + 4*i8x8;
4735 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4736 return -1;
4740 }else{
4741 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4742 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4747 if(cbp&0x30){
4748 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4749 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4750 return -1;
4754 if(cbp&0x20){
4755 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4756 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4757 for(i4x4=0; i4x4<4; i4x4++){
4758 const int index= 16 + 4*chroma_idx + i4x4;
4759 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4760 return -1;
4764 }else{
4765 uint8_t * const nnz= &h->non_zero_count_cache[0];
4766 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4767 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4769 }else{
4770 uint8_t * const nnz= &h->non_zero_count_cache[0];
4771 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4772 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4773 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4775 s->current_picture.qscale_table[mb_xy]= s->qscale;
4776 write_back_non_zero_count(h);
4778 if(MB_MBAFF){
4779 h->ref_count[0] >>= 1;
4780 h->ref_count[1] >>= 1;
4783 return 0;
4786 static int decode_cabac_field_decoding_flag(H264Context *h) {
4787 MpegEncContext * const s = &h->s;
4788 const int mb_x = s->mb_x;
4789 const int mb_y = s->mb_y & ~1;
4790 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4791 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4793 unsigned int ctx = 0;
4795 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4796 ctx += 1;
4798 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4799 ctx += 1;
4802 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
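/* CABAC intra mb_type binarization handled below: the first bin (with a context from
 * the neighbouring macroblocks in intra slices) selects I_4x4, a terminating symbol
 * selects I_PCM, and the remaining bins build the I_16x16 type from the luma cbp
 * flag, the chroma cbp and the prediction mode. */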
4805 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4806 uint8_t *state= &h->cabac_state[ctx_base];
4807 int mb_type;
4809 if(intra_slice){
4810 MpegEncContext * const s = &h->s;
4811 const int mba_xy = h->left_mb_xy[0];
4812 const int mbb_xy = h->top_mb_xy;
4813 int ctx=0;
4814 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4815 ctx++;
4816 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4817 ctx++;
4818 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4819 return 0; /* I4x4 */
4820 state += 2;
4821 }else{
4822 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4823 return 0; /* I4x4 */
4826 if( get_cabac_terminate( &h->cabac ) )
4827 return 25; /* PCM */
4829 mb_type = 1; /* I16x16 */
4830 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4831 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4832 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4833 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4834 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4835 return mb_type;
4838 static int decode_cabac_mb_type( H264Context *h ) {
4839 MpegEncContext * const s = &h->s;
4841 if( h->slice_type_nos == FF_I_TYPE ) {
4842 return decode_cabac_intra_mb_type(h, 3, 1);
4843 } else if( h->slice_type_nos == FF_P_TYPE ) {
4844 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4845 /* P-type */
4846 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4847 /* P_L0_D16x16, P_8x8 */
4848 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4849 } else {
4850 /* P_L0_D8x16, P_L0_D16x8 */
4851 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4853 } else {
4854 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4856 } else if( h->slice_type_nos == FF_B_TYPE ) {
4857 const int mba_xy = h->left_mb_xy[0];
4858 const int mbb_xy = h->top_mb_xy;
4859 int ctx = 0;
4860 int bits;
4862 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4863 ctx++;
4864 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4865 ctx++;
4867 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4868 return 0; /* B_Direct_16x16 */
4870 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4871 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4874 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4875 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4876 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4877 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4878 if( bits < 8 )
4879 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4880 else if( bits == 13 ) {
4881 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4882 } else if( bits == 14 )
4883 return 11; /* B_L1_L0_8x16 */
4884 else if( bits == 15 )
4885 return 22; /* B_8x8 */
4887 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4888 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4889 } else {
4890 /* TODO SI/SP frames? */
4891 return -1;
4895 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4896 MpegEncContext * const s = &h->s;
4897 int mba_xy, mbb_xy;
4898 int ctx = 0;
4900 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4901 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4902 mba_xy = mb_xy - 1;
4903 if( (mb_y&1)
4904 && h->slice_table[mba_xy] == h->slice_num
4905 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4906 mba_xy += s->mb_stride;
4907 if( MB_FIELD ){
4908 mbb_xy = mb_xy - s->mb_stride;
4909 if( !(mb_y&1)
4910 && h->slice_table[mbb_xy] == h->slice_num
4911 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4912 mbb_xy -= s->mb_stride;
4913 }else
4914 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4915 }else{
4916 int mb_xy = h->mb_xy;
4917 mba_xy = mb_xy - 1;
4918 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4921 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4922 ctx++;
4923 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4924 ctx++;
4926 if( h->slice_type_nos == FF_B_TYPE )
4927 ctx += 13;
4928 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4931 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4932 int mode = 0;
4934 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4935 return pred_mode;
4937 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4938 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4939 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4941 if( mode >= pred_mode )
4942 return mode + 1;
4943 else
4944 return mode;
4947 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4948 const int mba_xy = h->left_mb_xy[0];
4949 const int mbb_xy = h->top_mb_xy;
4951 int ctx = 0;
4953 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4954 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4955 ctx++;
4957 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4958 ctx++;
4960 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4961 return 0;
4963 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4964 return 1;
4965 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4966 return 2;
4967 else
4968 return 3;
4971 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4972 int cbp_b, cbp_a, ctx, cbp = 0;
4974 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4975 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
4977 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4978 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4979 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
4980 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4981 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
4982 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4983 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
4984 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4985 return cbp;
4987 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4988 int ctx;
4989 int cbp_a, cbp_b;
4991 cbp_a = (h->left_cbp>>4)&0x03;
4992 cbp_b = (h-> top_cbp>>4)&0x03;
4994 ctx = 0;
4995 if( cbp_a > 0 ) ctx++;
4996 if( cbp_b > 0 ) ctx += 2;
4997 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4998 return 0;
5000 ctx = 4;
5001 if( cbp_a == 2 ) ctx++;
5002 if( cbp_b == 2 ) ctx += 2;
5003 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
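/* mb_qp_delta is coded as a unary bin string (the first context depends on whether
 * the previous delta was nonzero); decode_cabac_mb_dqp() below maps the unsigned
 * value k back to a signed delta: odd k -> +(k+1)/2, even k -> -k/2. */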
5005 static int decode_cabac_mb_dqp( H264Context *h) {
5006 int ctx = 0;
5007 int val = 0;
5009 if( h->last_qscale_diff != 0 )
5010 ctx++;
5012 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5013 if( ctx < 2 )
5014 ctx = 2;
5015 else
5016 ctx = 3;
5017 val++;
5018 if(val > 102) //prevent infinite loop
5019 return INT_MIN;
5022 if( val&0x01 )
5023 return (val + 1)/2;
5024 else
5025 return -(val + 1)/2;
5027 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5028 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5029 return 0; /* 8x8 */
5030 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5031 return 1; /* 8x4 */
5032 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5033 return 2; /* 4x8 */
5034 return 3; /* 4x4 */
5036 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5037 int type;
5038 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5039 return 0; /* B_Direct_8x8 */
5040 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5041 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5042 type = 3;
5043 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5044 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5045 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5046 type += 4;
5048 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5049 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5050 return type;
5053 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5054 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
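/* decode_cabac_mb_ref(): the reference index is decoded as a unary bin string.
 * The context of the first bin (54..57) depends on whether the blocks to the
 * left and above use a reference index > 0 (direct-predicted blocks are
 * ignored in B slices); the remaining bins use contexts 58 and 59. */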
5057 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5058 int refa = h->ref_cache[list][scan8[n] - 1];
5059 int refb = h->ref_cache[list][scan8[n] - 8];
5060 int ref = 0;
5061 int ctx = 0;
5063 if( h->slice_type_nos == FF_B_TYPE) {
5064 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5065 ctx++;
5066 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5067 ctx += 2;
5068 } else {
5069 if( refa > 0 )
5070 ctx++;
5071 if( refb > 0 )
5072 ctx += 2;
5075 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5076 ref++;
5077 if( ctx < 4 )
5078 ctx = 4;
5079 else
5080 ctx = 5;
5081 if(ref >= 32 /*h->ref_list[list]*/){
5082 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5083 return 0; //FIXME we should return -1 and check the return everywhere
5086 return ref;
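/* decode_cabac_mb_mvd(): one mvd component (l=0: x, l=1: y) is decoded with
 * the UEG3 binarization: the context of the first bin (base 40 for x, 47 for
 * y) depends on the sum of the neighbouring |mvd| values (<3, 3..32, >32);
 * magnitudes 1..8 use a truncated unary prefix with contexts base+3..base+6,
 * larger magnitudes add a 3rd order Exp-Golomb suffix in bypass mode, and the
 * sign is a single bypass bin. */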
5089 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5090 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5091 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5092 int ctxbase = (l == 0) ? 40 : 47;
5093 int ctx, mvd;
5095 if( amvd < 3 )
5096 ctx = 0;
5097 else if( amvd > 32 )
5098 ctx = 2;
5099 else
5100 ctx = 1;
5102 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5103 return 0;
5105 mvd= 1;
5106 ctx= 3;
5107 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5108 mvd++;
5109 if( ctx < 6 )
5110 ctx++;
5113 if( mvd >= 9 ) {
5114 int k = 3;
5115 while( get_cabac_bypass( &h->cabac ) ) {
5116 mvd += 1 << k;
5117 k++;
5118 if(k>24){
5119 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5120 return INT_MIN;
5123 while( k-- ) {
5124 if( get_cabac_bypass( &h->cabac ) )
5125 mvd += 1 << k;
5128 return get_cabac_bypass_sign( &h->cabac, -mvd );
5131 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5132 int nza, nzb;
5133 int ctx = 0;
5135 if( is_dc ) {
5136 if( cat == 0 ) {
5137 nza = h->left_cbp&0x100;
5138 nzb = h-> top_cbp&0x100;
5139 } else {
5140 nza = (h->left_cbp>>(6+idx))&0x01;
5141 nzb = (h-> top_cbp>>(6+idx))&0x01;
5143 } else {
5144 if( cat == 4 ) {
5145 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5146 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5147 } else {
5148 assert(cat == 1 || cat == 2);
5149 nza = h->non_zero_count_cache[scan8[idx] - 1];
5150 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5154 if( nza > 0 )
5155 ctx++;
5157 if( nzb > 0 )
5158 ctx += 2;
5160 return ctx + 4 * cat;
5163 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5164 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5165 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5166 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5167 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5170 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5171 static const int significant_coeff_flag_offset[2][6] = {
5172 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5173 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5175 static const int last_coeff_flag_offset[2][6] = {
5176 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5177 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5179 static const int coeff_abs_level_m1_offset[6] = {
5180 227+0, 227+10, 227+20, 227+30, 227+39, 426
5182 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5183 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5184 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5185 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5186 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5187 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5188 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5189 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5190 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5192 /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5193 * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5194 * map node ctx => cabac ctx for level=1 */
5195 static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5196 /* map node ctx => cabac ctx for level>1 */
5197 static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5198 static const uint8_t coeff_abs_level_transition[2][8] = {
5199 /* update node ctx after decoding a level=1 */
5200 { 1, 2, 3, 3, 4, 5, 6, 7 },
5201 /* update node ctx after decoding a level>1 */
5202 { 4, 4, 4, 4, 5, 6, 7, 7 }
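/* Example: after two trailing |level|==1 coefficients node_ctx is 2 (the
 * level=1 bin uses context 3); the first |level|>1 moves it to 4, and each
 * further |level|>1 advances it up to 7, so the level=1 bin then uses context
 * 0 and the level>1 escape uses contexts 6..9. The levels themselves are
 * decoded in reverse scan order in the do/while loop below. */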
5205 int index[64];
5207 int av_unused last;
5208 int coeff_count = 0;
5209 int node_ctx = 0;
5211 uint8_t *significant_coeff_ctx_base;
5212 uint8_t *last_coeff_ctx_base;
5213 uint8_t *abs_level_m1_ctx_base;
5215 #ifndef ARCH_X86
5216 #define CABAC_ON_STACK
5217 #endif
5218 #ifdef CABAC_ON_STACK
5219 #define CC &cc
5220 CABACContext cc;
5221 cc.range = h->cabac.range;
5222 cc.low = h->cabac.low;
5223 cc.bytestream= h->cabac.bytestream;
5224 #else
5225 #define CC &h->cabac
5226 #endif
5229 /* cat: 0-> DC 16x16 n = 0
5230 * 1-> AC 16x16 n = luma4x4idx
5231 * 2-> Luma4x4 n = luma4x4idx
5232 * 3-> DC Chroma n = iCbCr
5233 * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5234 * 5-> Luma8x8 n = 4 * luma8x8idx
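 *
 * For example, decode_mb_cabac() below decodes the Intra16x16 luma DC block
 * with (cat=0, n=0, max_coeff=16, qmul=NULL), a chroma DC block with (cat=3,
 * n=iCbCr, max_coeff=4, qmul=NULL) and an 8x8 transform block with (cat=5,
 * n=4*luma8x8idx, max_coeff=64).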
5237 /* read coded block flag */
5238 if( is_dc || cat != 5 ) {
5239 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5240 if( !is_dc ) {
5241 if( cat == 4 )
5242 h->non_zero_count_cache[scan8[16+n]] = 0;
5243 else
5244 h->non_zero_count_cache[scan8[n]] = 0;
5247 #ifdef CABAC_ON_STACK
5248 h->cabac.range = cc.range ;
5249 h->cabac.low = cc.low ;
5250 h->cabac.bytestream= cc.bytestream;
5251 #endif
5252 return;
5256 significant_coeff_ctx_base = h->cabac_state
5257 + significant_coeff_flag_offset[MB_FIELD][cat];
5258 last_coeff_ctx_base = h->cabac_state
5259 + last_coeff_flag_offset[MB_FIELD][cat];
5260 abs_level_m1_ctx_base = h->cabac_state
5261 + coeff_abs_level_m1_offset[cat];
5263 if( !is_dc && cat == 5 ) {
5264 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5265 for(last= 0; last < coefs; last++) { \
5266 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5267 if( get_cabac( CC, sig_ctx )) { \
5268 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5269 index[coeff_count++] = last; \
5270 if( get_cabac( CC, last_ctx ) ) { \
5271 last= max_coeff; \
5272 break; \
5276 if( last == max_coeff -1 ) {\
5277 index[coeff_count++] = last;\
5279 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5280 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5281 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5282 } else {
5283 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5284 #else
5285 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5286 } else {
5287 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5288 #endif
5290 assert(coeff_count > 0);
5292 if( is_dc ) {
5293 if( cat == 0 )
5294 h->cbp_table[h->mb_xy] |= 0x100;
5295 else
5296 h->cbp_table[h->mb_xy] |= 0x40 << n;
5297 } else {
5298 if( cat == 5 )
5299 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5300 else if( cat == 4 )
5301 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5302 else {
5303 assert( cat == 1 || cat == 2 );
5304 h->non_zero_count_cache[scan8[n]] = coeff_count;
5308 do {
5309 uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5311 int j= scantable[index[--coeff_count]];
5313 if( get_cabac( CC, ctx ) == 0 ) {
5314 node_ctx = coeff_abs_level_transition[0][node_ctx];
5315 if( is_dc ) {
5316 block[j] = get_cabac_bypass_sign( CC, -1);
5317 }else{
5318 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5320 } else {
5321 int coeff_abs = 2;
5322 ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5323 node_ctx = coeff_abs_level_transition[1][node_ctx];
5325 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5326 coeff_abs++;
5329 if( coeff_abs >= 15 ) {
5330 int j = 0;
5331 while( get_cabac_bypass( CC ) ) {
5332 j++;
5335 coeff_abs=1;
5336 while( j-- ) {
5337 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5339 coeff_abs+= 14;
5342 if( is_dc ) {
5343 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5344 }else{
5345 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5348 } while( coeff_count );
5349 #ifdef CABAC_ON_STACK
5350 h->cabac.range = cc.range ;
5351 h->cabac.low = cc.low ;
5352 h->cabac.bytestream= cc.bytestream;
5353 #endif
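/* decode_cabac_residual_internal() is av_always_inline with a constant is_dc
 * argument, so the two wrappers below let the compiler emit specialised DC and
 * non-DC versions with the is_dc branches folded away; with CONFIG_SMALL a
 * single shared copy is kept and is_dc is derived from cat at run time. */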
5357 #ifndef CONFIG_SMALL
5358 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5359 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5362 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5363 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5365 #endif
5367 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5368 #ifdef CONFIG_SMALL
5369 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5370 #else
5371 if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5372 else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5373 #endif
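/* compute_mb_neighbors() fills top_mb_xy and left_mb_xy[0], the neighbours
 * used for CABAC context selection. In MBAFF frames macroblocks are stored in
 * vertical pairs, so when the current pair and a neighbouring pair differ in
 * field/frame coding the neighbour index has to be moved within its pair (the
 * extra mb_stride adjustments); in field pictures the row directly above in
 * the tables belongs to the other field, hence top_mb_xy is moved up one more
 * row. */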
5376 static inline void compute_mb_neighbors(H264Context *h)
5378 MpegEncContext * const s = &h->s;
5379 const int mb_xy = h->mb_xy;
5380 h->top_mb_xy = mb_xy - s->mb_stride;
5381 h->left_mb_xy[0] = mb_xy - 1;
5382 if(FRAME_MBAFF){
5383 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5384 const int top_pair_xy = pair_xy - s->mb_stride;
5385 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5386 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5387 const int curr_mb_frame_flag = !MB_FIELD;
5388 const int bottom = (s->mb_y & 1);
5389 if (bottom
5390 ? !curr_mb_frame_flag // bottom macroblock
5391 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5393 h->top_mb_xy -= s->mb_stride;
5395 if (left_mb_frame_flag != curr_mb_frame_flag) {
5396 h->left_mb_xy[0] = pair_xy - 1;
5398 } else if (FIELD_PICTURE) {
5399 h->top_mb_xy -= s->mb_stride;
5401 return;
5405 * Decodes a macroblock.
5406 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is detected
5408 static int decode_mb_cabac(H264Context *h) {
5409 MpegEncContext * const s = &h->s;
5410 int mb_xy;
5411 int mb_type, partition_count, cbp = 0;
5412 int dct8x8_allowed= h->pps.transform_8x8_mode;
5414 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5416 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
5418 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5419 if( h->slice_type_nos != FF_I_TYPE ) {
5420 int skip;
5421 /* a skipped mb needs the aff flag from the following mb */
5422 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5423 predict_field_decoding_flag(h);
5424 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5425 skip = h->next_mb_skipped;
5426 else
5427 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5428 /* read skip flags */
5429 if( skip ) {
5430 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5431 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5432 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5433 if(h->next_mb_skipped)
5434 predict_field_decoding_flag(h);
5435 else
5436 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5439 decode_mb_skip(h);
5441 h->cbp_table[mb_xy] = 0;
5442 h->chroma_pred_mode_table[mb_xy] = 0;
5443 h->last_qscale_diff = 0;
5445 return 0;
5449 if(FRAME_MBAFF){
5450 if( (s->mb_y&1) == 0 )
5451 h->mb_mbaff =
5452 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5455 h->prev_mb_skipped = 0;
5457 compute_mb_neighbors(h);
5458 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5459 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5460 return -1;
5463 if( h->slice_type_nos == FF_B_TYPE ) {
5464 if( mb_type < 23 ){
5465 partition_count= b_mb_type_info[mb_type].partition_count;
5466 mb_type= b_mb_type_info[mb_type].type;
5467 }else{
5468 mb_type -= 23;
5469 goto decode_intra_mb;
5471 } else if( h->slice_type_nos == FF_P_TYPE ) {
5472 if( mb_type < 5) {
5473 partition_count= p_mb_type_info[mb_type].partition_count;
5474 mb_type= p_mb_type_info[mb_type].type;
5475 } else {
5476 mb_type -= 5;
5477 goto decode_intra_mb;
5479 } else {
5480 if(h->slice_type == FF_SI_TYPE && mb_type)
5481 mb_type--;
5482 assert(h->slice_type_nos == FF_I_TYPE);
5483 decode_intra_mb:
5484 partition_count = 0;
5485 cbp= i_mb_type_info[mb_type].cbp;
5486 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5487 mb_type= i_mb_type_info[mb_type].type;
5489 if(MB_FIELD)
5490 mb_type |= MB_TYPE_INTERLACED;
5492 h->slice_table[ mb_xy ]= h->slice_num;
5494 if(IS_INTRA_PCM(mb_type)) {
5495 const uint8_t *ptr;
5497 // We assume these blocks are very rare so we do not optimize them.
5498 // FIXME The two following lines get the bitstream position in the cabac
5499 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5500 ptr= h->cabac.bytestream;
5501 if(h->cabac.low&0x1) ptr--;
5502 if(CABAC_BITS==16){
5503 if(h->cabac.low&0x1FF) ptr--;
5506 // The pixels are stored in the same order as the levels in the h->mb array.
5507 memcpy(h->mb, ptr, 256); ptr+=256;
5508 if(CHROMA){
5509 memcpy(h->mb+128, ptr, 128); ptr+=128;
5512 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5514 // All blocks are present
5515 h->cbp_table[mb_xy] = 0x1ef;
5516 h->chroma_pred_mode_table[mb_xy] = 0;
5517 // In deblocking, the quantizer is 0
5518 s->current_picture.qscale_table[mb_xy]= 0;
5519 // All coeffs are present
5520 memset(h->non_zero_count[mb_xy], 16, 16);
5521 s->current_picture.mb_type[mb_xy]= mb_type;
5522 h->last_qscale_diff = 0;
5523 return 0;
5526 if(MB_MBAFF){
5527 h->ref_count[0] <<= 1;
5528 h->ref_count[1] <<= 1;
5531 fill_caches(h, mb_type, 0);
5533 if( IS_INTRA( mb_type ) ) {
5534 int i, pred_mode;
5535 if( IS_INTRA4x4( mb_type ) ) {
5536 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5537 mb_type |= MB_TYPE_8x8DCT;
5538 for( i = 0; i < 16; i+=4 ) {
5539 int pred = pred_intra_mode( h, i );
5540 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5541 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5543 } else {
5544 for( i = 0; i < 16; i++ ) {
5545 int pred = pred_intra_mode( h, i );
5546 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5548 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5551 write_back_intra_pred_mode(h);
5552 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5553 } else {
5554 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5555 if( h->intra16x16_pred_mode < 0 ) return -1;
5557 if(CHROMA){
5558 h->chroma_pred_mode_table[mb_xy] =
5559 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5561 pred_mode= check_intra_pred_mode( h, pred_mode );
5562 if( pred_mode < 0 ) return -1;
5563 h->chroma_pred_mode= pred_mode;
5565 } else if( partition_count == 4 ) {
5566 int i, j, sub_partition_count[4], list, ref[2][4];
5568 if( h->slice_type_nos == FF_B_TYPE ) {
5569 for( i = 0; i < 4; i++ ) {
5570 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5571 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5572 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5574 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5575 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5576 pred_direct_motion(h, &mb_type);
5577 h->ref_cache[0][scan8[4]] =
5578 h->ref_cache[1][scan8[4]] =
5579 h->ref_cache[0][scan8[12]] =
5580 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5581 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5582 for( i = 0; i < 4; i++ )
5583 if( IS_DIRECT(h->sub_mb_type[i]) )
5584 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5587 } else {
5588 for( i = 0; i < 4; i++ ) {
5589 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5590 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5591 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5595 for( list = 0; list < h->list_count; list++ ) {
5596 for( i = 0; i < 4; i++ ) {
5597 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5598 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5599 if( h->ref_count[list] > 1 )
5600 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5601 else
5602 ref[list][i] = 0;
5603 } else {
5604 ref[list][i] = -1;
5606 h->ref_cache[list][ scan8[4*i]+1 ]=
5607 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5611 if(dct8x8_allowed)
5612 dct8x8_allowed = get_dct8x8_allowed(h);
5614 for(list=0; list<h->list_count; list++){
5615 for(i=0; i<4; i++){
5616 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5617 if(IS_DIRECT(h->sub_mb_type[i])){
5618 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5619 continue;
5622 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5623 const int sub_mb_type= h->sub_mb_type[i];
5624 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5625 for(j=0; j<sub_partition_count[i]; j++){
5626 int mpx, mpy;
5627 int mx, my;
5628 const int index= 4*i + block_width*j;
5629 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5630 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5631 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5633 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5634 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5635 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5637 if(IS_SUB_8X8(sub_mb_type)){
5638 mv_cache[ 1 ][0]=
5639 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5640 mv_cache[ 1 ][1]=
5641 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5643 mvd_cache[ 1 ][0]=
5644 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5645 mvd_cache[ 1 ][1]=
5646 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5647 }else if(IS_SUB_8X4(sub_mb_type)){
5648 mv_cache[ 1 ][0]= mx;
5649 mv_cache[ 1 ][1]= my;
5651 mvd_cache[ 1 ][0]= mx - mpx;
5652 mvd_cache[ 1 ][1]= my - mpy;
5653 }else if(IS_SUB_4X8(sub_mb_type)){
5654 mv_cache[ 8 ][0]= mx;
5655 mv_cache[ 8 ][1]= my;
5657 mvd_cache[ 8 ][0]= mx - mpx;
5658 mvd_cache[ 8 ][1]= my - mpy;
5660 mv_cache[ 0 ][0]= mx;
5661 mv_cache[ 0 ][1]= my;
5663 mvd_cache[ 0 ][0]= mx - mpx;
5664 mvd_cache[ 0 ][1]= my - mpy;
5666 }else{
5667 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5668 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5669 p[0] = p[1] = p[8] = p[9] = 0;
5670 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5674 } else if( IS_DIRECT(mb_type) ) {
5675 pred_direct_motion(h, &mb_type);
5676 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5677 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5678 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5679 } else {
5680 int list, mx, my, i, mpx, mpy;
5681 if(IS_16X16(mb_type)){
5682 for(list=0; list<h->list_count; list++){
5683 if(IS_DIR(mb_type, 0, list)){
5684 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5685 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5686 }else
5687 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5689 for(list=0; list<h->list_count; list++){
5690 if(IS_DIR(mb_type, 0, list)){
5691 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5693 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5694 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5695 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5697 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5698 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5699 }else
5700 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5703 else if(IS_16X8(mb_type)){
5704 for(list=0; list<h->list_count; list++){
5705 for(i=0; i<2; i++){
5706 if(IS_DIR(mb_type, i, list)){
5707 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5708 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5709 }else
5710 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5713 for(list=0; list<h->list_count; list++){
5714 for(i=0; i<2; i++){
5715 if(IS_DIR(mb_type, i, list)){
5716 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5717 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5718 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5719 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5721 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5722 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5723 }else{
5724 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5725 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5729 }else{
5730 assert(IS_8X16(mb_type));
5731 for(list=0; list<h->list_count; list++){
5732 for(i=0; i<2; i++){
5733 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5734 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5735 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5736 }else
5737 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5740 for(list=0; list<h->list_count; list++){
5741 for(i=0; i<2; i++){
5742 if(IS_DIR(mb_type, i, list)){
5743 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5744 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5745 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5747 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5748 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5749 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5750 }else{
5751 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5752 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5759 if( IS_INTER( mb_type ) ) {
5760 h->chroma_pred_mode_table[mb_xy] = 0;
5761 write_back_motion( h, mb_type );
5764 if( !IS_INTRA16x16( mb_type ) ) {
5765 cbp = decode_cabac_mb_cbp_luma( h );
5766 if(CHROMA)
5767 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5770 h->cbp_table[mb_xy] = h->cbp = cbp;
5772 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5773 if( decode_cabac_mb_transform_size( h ) )
5774 mb_type |= MB_TYPE_8x8DCT;
5776 s->current_picture.mb_type[mb_xy]= mb_type;
5778 if( cbp || IS_INTRA16x16( mb_type ) ) {
5779 const uint8_t *scan, *scan8x8, *dc_scan;
5780 const uint32_t *qmul;
5781 int dqp;
5783 if(IS_INTERLACED(mb_type)){
5784 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5785 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5786 dc_scan= luma_dc_field_scan;
5787 }else{
5788 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5789 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5790 dc_scan= luma_dc_zigzag_scan;
5793 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5794 if( dqp == INT_MIN ){
5795 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5796 return -1;
5798 s->qscale += dqp;
5799 if(((unsigned)s->qscale) > 51){
5800 if(s->qscale<0) s->qscale+= 52;
5801 else s->qscale-= 52;
5803 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5804 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5806 if( IS_INTRA16x16( mb_type ) ) {
5807 int i;
5808 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5809 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5811 if( cbp&15 ) {
5812 qmul = h->dequant4_coeff[0][s->qscale];
5813 for( i = 0; i < 16; i++ ) {
5814 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5815 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5817 } else {
5818 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5820 } else {
5821 int i8x8, i4x4;
5822 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5823 if( cbp & (1<<i8x8) ) {
5824 if( IS_8x8DCT(mb_type) ) {
5825 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5826 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5827 } else {
5828 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5829 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5830 const int index = 4*i8x8 + i4x4;
5831 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5832 //START_TIMER
5833 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5834 //STOP_TIMER("decode_residual")
5837 } else {
5838 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5839 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5844 if( cbp&0x30 ){
5845 int c;
5846 for( c = 0; c < 2; c++ ) {
5847 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5848 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5852 if( cbp&0x20 ) {
5853 int c, i;
5854 for( c = 0; c < 2; c++ ) {
5855 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5856 for( i = 0; i < 4; i++ ) {
5857 const int index = 16 + 4 * c + i;
5858 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5859 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5862 } else {
5863 uint8_t * const nnz= &h->non_zero_count_cache[0];
5864 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5865 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5867 } else {
5868 uint8_t * const nnz= &h->non_zero_count_cache[0];
5869 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5870 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5871 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5872 h->last_qscale_diff = 0;
5875 s->current_picture.qscale_table[mb_xy]= s->qscale;
5876 write_back_non_zero_count(h);
5878 if(MB_MBAFF){
5879 h->ref_count[0] >>= 1;
5880 h->ref_count[1] >>= 1;
5883 return 0;
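/* Deblocking helpers: for bS < 4 the normal filter (clipped to +/-tc) is done
 * by the dsputil h264 loop filter functions, while for luma bS == 4 edges the
 * strong intra filter is applied inline with the p0'/p1'/p2' and q0'/q1'/q2'
 * formulas of the standard; chroma bS == 4 edges go through the
 * *_loop_filter_chroma_intra dsputil functions. */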
5887 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5888 int i, d;
5889 const int index_a = qp + h->slice_alpha_c0_offset;
5890 const int alpha = (alpha_table+52)[index_a];
5891 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5893 if( bS[0] < 4 ) {
5894 int8_t tc[4];
5895 for(i=0; i<4; i++)
5896 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5897 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5898 } else {
5899 /* 16px edge length, because bS=4 is triggered by being at
5900 * the edge of an intra MB, so all 4 bS are the same */
5901 for( d = 0; d < 16; d++ ) {
5902 const int p0 = pix[-1];
5903 const int p1 = pix[-2];
5904 const int p2 = pix[-3];
5906 const int q0 = pix[0];
5907 const int q1 = pix[1];
5908 const int q2 = pix[2];
5910 if( FFABS( p0 - q0 ) < alpha &&
5911 FFABS( p1 - p0 ) < beta &&
5912 FFABS( q1 - q0 ) < beta ) {
5914 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5915 if( FFABS( p2 - p0 ) < beta)
5917 const int p3 = pix[-4];
5918 /* p0', p1', p2' */
5919 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5920 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5921 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5922 } else {
5923 /* p0' */
5924 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5926 if( FFABS( q2 - q0 ) < beta)
5928 const int q3 = pix[3];
5929 /* q0', q1', q2' */
5930 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5931 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5932 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5933 } else {
5934 /* q0' */
5935 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5937 }else{
5938 /* p0', q0' */
5939 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5940 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5942 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5944 pix += stride;
5948 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5949 int i;
5950 const int index_a = qp + h->slice_alpha_c0_offset;
5951 const int alpha = (alpha_table+52)[index_a];
5952 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5954 if( bS[0] < 4 ) {
5955 int8_t tc[4];
5956 for(i=0; i<4; i++)
5957 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5958 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5959 } else {
5960 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5964 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5965 int i;
5966 for( i = 0; i < 16; i++, pix += stride) {
5967 int index_a;
5968 int alpha;
5969 int beta;
5971 int qp_index;
5972 int bS_index = (i >> 1);
5973 if (!MB_FIELD) {
5974 bS_index &= ~1;
5975 bS_index |= (i & 1);
5978 if( bS[bS_index] == 0 ) {
5979 continue;
5982 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5983 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5984 alpha = (alpha_table+52)[index_a];
5985 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5987 if( bS[bS_index] < 4 ) {
5988 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5989 const int p0 = pix[-1];
5990 const int p1 = pix[-2];
5991 const int p2 = pix[-3];
5992 const int q0 = pix[0];
5993 const int q1 = pix[1];
5994 const int q2 = pix[2];
5996 if( FFABS( p0 - q0 ) < alpha &&
5997 FFABS( p1 - p0 ) < beta &&
5998 FFABS( q1 - q0 ) < beta ) {
5999 int tc = tc0;
6000 int i_delta;
6002 if( FFABS( p2 - p0 ) < beta ) {
6003 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6004 tc++;
6006 if( FFABS( q2 - q0 ) < beta ) {
6007 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6008 tc++;
6011 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6012 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6013 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6014 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6016 }else{
6017 const int p0 = pix[-1];
6018 const int p1 = pix[-2];
6019 const int p2 = pix[-3];
6021 const int q0 = pix[0];
6022 const int q1 = pix[1];
6023 const int q2 = pix[2];
6025 if( FFABS( p0 - q0 ) < alpha &&
6026 FFABS( p1 - p0 ) < beta &&
6027 FFABS( q1 - q0 ) < beta ) {
6029 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6030 if( FFABS( p2 - p0 ) < beta)
6032 const int p3 = pix[-4];
6033 /* p0', p1', p2' */
6034 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6035 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6036 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6037 } else {
6038 /* p0' */
6039 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6041 if( FFABS( q2 - q0 ) < beta)
6043 const int q3 = pix[3];
6044 /* q0', q1', q2' */
6045 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6046 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6047 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6048 } else {
6049 /* q0' */
6050 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6052 }else{
6053 /* p0', q0' */
6054 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6055 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6057 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6062 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6063 int i;
6064 for( i = 0; i < 8; i++, pix += stride) {
6065 int index_a;
6066 int alpha;
6067 int beta;
6069 int qp_index;
6070 int bS_index = i;
6072 if( bS[bS_index] == 0 ) {
6073 continue;
6076 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6077 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6078 alpha = (alpha_table+52)[index_a];
6079 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6081 if( bS[bS_index] < 4 ) {
6082 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6083 const int p0 = pix[-1];
6084 const int p1 = pix[-2];
6085 const int q0 = pix[0];
6086 const int q1 = pix[1];
6088 if( FFABS( p0 - q0 ) < alpha &&
6089 FFABS( p1 - p0 ) < beta &&
6090 FFABS( q1 - q0 ) < beta ) {
6091 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6093 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6094 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6095 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6097 }else{
6098 const int p0 = pix[-1];
6099 const int p1 = pix[-2];
6100 const int q0 = pix[0];
6101 const int q1 = pix[1];
6103 if( FFABS( p0 - q0 ) < alpha &&
6104 FFABS( p1 - p0 ) < beta &&
6105 FFABS( q1 - q0 ) < beta ) {
6107 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6108 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6109 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6115 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6116 int i, d;
6117 const int index_a = qp + h->slice_alpha_c0_offset;
6118 const int alpha = (alpha_table+52)[index_a];
6119 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6120 const int pix_next = stride;
6122 if( bS[0] < 4 ) {
6123 int8_t tc[4];
6124 for(i=0; i<4; i++)
6125 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6126 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6127 } else {
6128 /* 16px edge length, see filter_mb_edgev */
6129 for( d = 0; d < 16; d++ ) {
6130 const int p0 = pix[-1*pix_next];
6131 const int p1 = pix[-2*pix_next];
6132 const int p2 = pix[-3*pix_next];
6133 const int q0 = pix[0];
6134 const int q1 = pix[1*pix_next];
6135 const int q2 = pix[2*pix_next];
6137 if( FFABS( p0 - q0 ) < alpha &&
6138 FFABS( p1 - p0 ) < beta &&
6139 FFABS( q1 - q0 ) < beta ) {
6141 const int p3 = pix[-4*pix_next];
6142 const int q3 = pix[ 3*pix_next];
6144 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6145 if( FFABS( p2 - p0 ) < beta) {
6146 /* p0', p1', p2' */
6147 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6148 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6149 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6150 } else {
6151 /* p0' */
6152 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6154 if( FFABS( q2 - q0 ) < beta) {
6155 /* q0', q1', q2' */
6156 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6157 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6158 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6159 } else {
6160 /* q0' */
6161 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6163 }else{
6164 /* p0', q0' */
6165 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6166 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6168 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6170 pix++;
6175 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6176 int i;
6177 const int index_a = qp + h->slice_alpha_c0_offset;
6178 const int alpha = (alpha_table+52)[index_a];
6179 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6181 if( bS[0] < 4 ) {
6182 int8_t tc[4];
6183 for(i=0; i<4; i++)
6184 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6185 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6186 } else {
6187 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6191 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6192 MpegEncContext * const s = &h->s;
6193 int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6194 int mb_xy, mb_type;
6195 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6197 mb_xy = h->mb_xy;
6199 if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6200 1 || // always true: the optimized path below is currently never taken
6201 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6202 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6203 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6204 return;
6206 assert(!FRAME_MBAFF);
6208 mb_type = s->current_picture.mb_type[mb_xy];
6209 qp = s->current_picture.qscale_table[mb_xy];
6210 qp0 = s->current_picture.qscale_table[mb_xy-1];
6211 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6212 qpc = get_chroma_qp( h, 0, qp );
6213 qpc0 = get_chroma_qp( h, 0, qp0 );
6214 qpc1 = get_chroma_qp( h, 0, qp1 );
6215 qp0 = (qp + qp0 + 1) >> 1;
6216 qp1 = (qp + qp1 + 1) >> 1;
6217 qpc0 = (qpc + qpc0 + 1) >> 1;
6218 qpc1 = (qpc + qpc1 + 1) >> 1;
6219 qp_thresh = 15 - h->slice_alpha_c0_offset;
6220 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6221 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6222 return;
6224 if( IS_INTRA(mb_type) ) {
6225 int16_t bS4[4] = {4,4,4,4};
6226 int16_t bS3[4] = {3,3,3,3};
6227 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6228 if( IS_8x8DCT(mb_type) ) {
6229 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6230 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6231 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6232 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6233 } else {
6234 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6235 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6236 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6237 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6238 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6239 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6240 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6241 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6243 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6244 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6245 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6246 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6247 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6248 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6249 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6250 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6251 return;
6252 } else {
6253 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6254 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6255 int edges;
6256 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6257 edges = 4;
6258 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6259 } else {
6260 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6261 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6262 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6263 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6264 ? 3 : 0;
6265 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6266 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6267 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6268 (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6270 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6271 bSv[0][0] = 0x0004000400040004ULL;
6272 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6273 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6275 #define FILTER(hv,dir,edge)\
6276 if(bSv[dir][edge]) {\
6277 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6278 if(!(edge&1)) {\
6279 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6280 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6283 if( edges == 1 ) {
6284 FILTER(v,0,0);
6285 FILTER(h,1,0);
6286 } else if( IS_8x8DCT(mb_type) ) {
6287 FILTER(v,0,0);
6288 FILTER(v,0,2);
6289 FILTER(h,1,0);
6290 FILTER(h,1,2);
6291 } else {
6292 FILTER(v,0,0);
6293 FILTER(v,0,1);
6294 FILTER(v,0,2);
6295 FILTER(v,0,3);
6296 FILTER(h,1,0);
6297 FILTER(h,1,1);
6298 FILTER(h,1,2);
6299 FILTER(h,1,3);
6301 #undef FILTER
6305 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6306 MpegEncContext * const s = &h->s;
6307 const int mb_xy= mb_x + mb_y*s->mb_stride;
6308 const int mb_type = s->current_picture.mb_type[mb_xy];
6309 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6310 int first_vertical_edge_done = 0;
6311 int dir;
6313 //for sufficiently low qp, filtering wouldn't do anything
6314 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
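// (alpha is 0 for indexA < 16, so if the averaged QP of every edge plus
// slice_alpha_c0_offset, including the worst-case chroma_qp_index_offset,
// stays at or below 15, |p0-q0| < alpha can never hold and no sample is
// modified, whatever bS is)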
6315 if(!FRAME_MBAFF){
6316 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6317 int qp = s->current_picture.qscale_table[mb_xy];
6318 if(qp <= qp_thresh
6319 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6320 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6321 return;
6325 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6326 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6327 int top_type, left_type[2];
6328 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6329 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6330 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6332 if(IS_8x8DCT(top_type)){
6333 h->non_zero_count_cache[4+8*0]=
6334 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6335 h->non_zero_count_cache[6+8*0]=
6336 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6338 if(IS_8x8DCT(left_type[0])){
6339 h->non_zero_count_cache[3+8*1]=
6340 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6342 if(IS_8x8DCT(left_type[1])){
6343 h->non_zero_count_cache[3+8*3]=
6344 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6347 if(IS_8x8DCT(mb_type)){
6348 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6349 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
6351 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6352 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6354 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6355 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6357 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6358 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6362 if (FRAME_MBAFF
6363 // left mb is in picture
6364 && h->slice_table[mb_xy-1] != 0xFFFF
6365 // and current and left pair do not have the same interlaced type
6366 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6367 // and left mb is in the same slice if deblocking_filter == 2
6368 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6369 /* First vertical edge is different in MBAFF frames
6370 * There are 8 different bS to compute and 2 different Qp
6372 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6373 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6374 int16_t bS[8];
6375 int qp[2];
6376 int bqp[2];
6377 int rqp[2];
6378 int mb_qp, mbn0_qp, mbn1_qp;
6379 int i;
6380 first_vertical_edge_done = 1;
6382 if( IS_INTRA(mb_type) )
6383 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6384 else {
6385 for( i = 0; i < 8; i++ ) {
6386 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6388 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6389 bS[i] = 4;
6390 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6391 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6392 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6394 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6395 bS[i] = 2;
6396 else
6397 bS[i] = 1;
6401 mb_qp = s->current_picture.qscale_table[mb_xy];
6402 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6403 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6404 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6405 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6406 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6407 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6408 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6409 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6410 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6411 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6412 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6413 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6415 /* Filter edge */
6416 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6417 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6418 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6419 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6420 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6422 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6423 for( dir = 0; dir < 2; dir++ )
6425 int edge;
6426 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6427 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6428 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6429 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6430 int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6432 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6433 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6434 // how often to recheck mv-based bS when iterating between edges
6435 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6436 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6437 // how often to recheck mv-based bS when iterating along each edge
6438 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6440 if (first_vertical_edge_done) {
6441 start = 1;
6442 first_vertical_edge_done = 0;
6445 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6446 start = 1;
6448 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6449 && !IS_INTERLACED(mb_type)
6450 && IS_INTERLACED(mbm_type)
6452 // This is a special case in the standard where the filtering must
6453 // be done twice (once for each field) even if we are in a
6454 // frame macroblock.
6456 static const int nnz_idx[4] = {4,5,6,3};
6457 unsigned int tmp_linesize = 2 * linesize;
6458 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6459 int mbn_xy = mb_xy - 2 * s->mb_stride;
6460 int qp;
6461 int i, j;
6462 int16_t bS[4];
6464 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6465 if( IS_INTRA(mb_type) ||
6466 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6467 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6468 } else {
6469 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6470 for( i = 0; i < 4; i++ ) {
6471 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6472 mbn_nnz[nnz_idx[i]] != 0 )
6473 bS[i] = 2;
6474 else
6475 bS[i] = 1;
6478 // Do not use s->qscale as the luma quantizer because it does not
6479 // have the same value in IPCM macroblocks.
6480 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6481 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6482 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6483 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6484 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6485 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6486 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6487 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6490 start = 1;
6493 /* Calculate bS */
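/* bS per 4-sample segment: 4 or 3 if either MB is intra (4 on the external
 * edge, unless it is a horizontal edge where at least one of the two MBs is
 * field coded, which gets 3), 2 if either side has non-zero coefficients,
 * 1 if the two sides use different reference frames or their MVs differ by
 * >= 4 quarter-pel units horizontally or >= mvy_limit vertically, and 0
 * otherwise; segments with bS == 0 are left unfiltered. */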
6494 for( edge = start; edge < edges; edge++ ) {
6495 /* mbn_xy: neighbor macroblock */
6496 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6497 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6498 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6499 int16_t bS[4];
6500 int qp;
6502 if( (edge&1) && IS_8x8DCT(mb_type) )
6503 continue;
6505 if( IS_INTRA(mb_type) ||
6506 IS_INTRA(mbn_type) ) {
6507 int value;
6508 if (edge == 0) {
6509 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6510 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6512 value = 4;
6513 } else {
6514 value = 3;
6516 } else {
6517 value = 3;
6519 bS[0] = bS[1] = bS[2] = bS[3] = value;
6520 } else {
6521 int i, l;
6522 int mv_done;
6524 if( edge & mask_edge ) {
6525 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6526 mv_done = 1;
6528 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6529 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6530 mv_done = 1;
6532 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6533 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6534 int bn_idx= b_idx - (dir ? 8:1);
6535 int v = 0;
6537 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6538 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6539 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6540 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6543 if(h->slice_type_nos == FF_B_TYPE && v){
6544 v=0;
6545 for( l = 0; !v && l < 2; l++ ) {
6546 int ln= 1-l;
6547 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6548 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6549 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6553 bS[0] = bS[1] = bS[2] = bS[3] = v;
6554 mv_done = 1;
6556 else
6557 mv_done = 0;
6559 for( i = 0; i < 4; i++ ) {
6560 int x = dir == 0 ? edge : i;
6561 int y = dir == 0 ? i : edge;
6562 int b_idx= 8 + 4 + x + 8*y;
6563 int bn_idx= b_idx - (dir ? 8:1);
6565 if( h->non_zero_count_cache[b_idx] != 0 ||
6566 h->non_zero_count_cache[bn_idx] != 0 ) {
6567 bS[i] = 2;
6569 else if(!mv_done)
6571 bS[i] = 0;
6572 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6573 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6574 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6575 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6576 bS[i] = 1;
6577 break;
6581 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6582 bS[i] = 0;
6583 for( l = 0; l < 2; l++ ) {
6584 int ln= 1-l;
6585 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6586 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6587 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6588 bS[i] = 1;
6589 break;
6596 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6597 continue;
6600 /* Filter edge */
6601 // Do not use s->qscale as the luma quantizer because it does not
6602 // have the same value in IPCM macroblocks.
6603 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6604 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6605 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6606 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6607 if( dir == 0 ) {
6608 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6609 if( (edge&1) == 0 ) {
6610 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6611 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6612 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6613 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6615 } else {
6616 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6617 if( (edge&1) == 0 ) {
6618 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6619 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6620 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6621 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6628 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6629 H264Context *h = *(void**)arg;
6630 MpegEncContext * const s = &h->s;
6631 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6633 s->mb_skip_run= -1;
6635 if( h->pps.cabac ) {
6636 int i;
6638 /* realign */
6639 align_get_bits( &s->gb );
6641 /* init cabac */
6642 ff_init_cabac_states( &h->cabac);
6643 ff_init_cabac_decoder( &h->cabac,
6644 s->gb.buffer + get_bits_count(&s->gb)/8,
6645 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6646 /* calculate pre-state */
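/* Each of the 460 context variables is initialised from an (m,n) pair of the
 * selected init table: pre = clip(((m*qp)>>4)+n, 1, 126); values <= 63 mean
 * MPS=0 with state index 63-pre, values >= 64 mean MPS=1 with state index
 * pre-64, and state and MPS are packed into one byte as 2*state+MPS below. */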
6647 for( i= 0; i < 460; i++ ) {
6648 int pre;
6649 if( h->slice_type_nos == FF_I_TYPE )
6650 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6651 else
6652 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6654 if( pre <= 63 )
6655 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6656 else
6657 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6660 for(;;){
6661 //START_TIMER
6662 int ret = decode_mb_cabac(h);
6663 int eos;
6664 //STOP_TIMER("decode_mb_cabac")
6666 if(ret>=0) hl_decode_mb(h);
6668 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6669 s->mb_y++;
6671 if(ret>=0) ret = decode_mb_cabac(h);
6673 if(ret>=0) hl_decode_mb(h);
6674 s->mb_y--;
6676 eos = get_cabac_terminate( &h->cabac );
6678 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6679 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6680 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6681 return -1;
6684 if( ++s->mb_x >= s->mb_width ) {
6685 s->mb_x = 0;
6686 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6687 ++s->mb_y;
6688 if(FIELD_OR_MBAFF_PICTURE) {
6689 ++s->mb_y;
6693 if( eos || s->mb_y >= s->mb_height ) {
6694 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6695 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6696 return 0;
6700 } else {
6701 for(;;){
6702 int ret = decode_mb_cavlc(h);
6704 if(ret>=0) hl_decode_mb(h);
6706 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6707 s->mb_y++;
6708 ret = decode_mb_cavlc(h);
6710 if(ret>=0) hl_decode_mb(h);
6711 s->mb_y--;
6714 if(ret<0){
6715 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6716 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6718 return -1;
6721 if(++s->mb_x >= s->mb_width){
6722 s->mb_x=0;
6723 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6724 ++s->mb_y;
6725 if(FIELD_OR_MBAFF_PICTURE) {
6726 ++s->mb_y;
6728 if(s->mb_y >= s->mb_height){
6729 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6731 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6732 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6734 return 0;
6735 }else{
6736 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6738 return -1;
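6739 // CAVLC has no explicit end-of-slice flag: the slice ends once the bit reader
6740 // reaches the end of the RBSP and no mb_skip_run is still pending.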
6743 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6744 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6745 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6746 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6748 return 0;
6749 }else{
6750 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6752 return -1;
6758 #if 0
6759 for(;s->mb_y < s->mb_height; s->mb_y++){
6760 for(;s->mb_x < s->mb_width; s->mb_x++){
6761 int ret= decode_mb(h);
6763 hl_decode_mb(h);
6765 if(ret<0){
6766 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6767 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6769 return -1;
6772 if(++s->mb_x >= s->mb_width){
6773 s->mb_x=0;
6774 if(++s->mb_y >= s->mb_height){
6775 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6776 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6778 return 0;
6779 }else{
6780 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6782 return -1;
6787 if(get_bits_count(s->gb) >= s->gb.size_in_bits){
6788 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6789 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6791 return 0;
6792 }else{
6793 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6795 return -1;
6799 s->mb_x=0;
6800 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6802 #endif
6803 return -1; //not reached
6806 static int decode_picture_timing(H264Context *h){
6807 MpegEncContext * const s = &h->s;
6808 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6809 skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6810 skip_bits(&s->gb, h->sps.dpb_output_delay_length); /* dpb_output_delay */
6812 if(h->sps.pic_struct_present_flag){
6813 unsigned int i, num_clock_ts;
6814 h->sei_pic_struct = get_bits(&s->gb, 4);
6816 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6817 return -1;
6819 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6821 for (i = 0 ; i < num_clock_ts ; i++){
6822 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6823 unsigned int full_timestamp_flag;
6824 skip_bits(&s->gb, 2); /* ct_type */
6825 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6826 skip_bits(&s->gb, 5); /* counting_type */
6827 full_timestamp_flag = get_bits(&s->gb, 1);
6828 skip_bits(&s->gb, 1); /* discontinuity_flag */
6829 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6830 skip_bits(&s->gb, 8); /* n_frames */
6831 if(full_timestamp_flag){
6832 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6833 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6834 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6835 }else{
6836 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6837 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6838 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6839 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6840 if(get_bits(&s->gb, 1)) /* hours_flag */
6841 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6845 if(h->sps.time_offset_length > 0)
6846 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6850 return 0;
6853 static int decode_unregistered_user_data(H264Context *h, int size){
6854 MpegEncContext * const s = &h->s;
6855 uint8_t user_data[16+256];
6856 int e, build, i;
6858 if(size<16)
6859 return -1;
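6860 // The payload starts with a 16-byte UUID; x264 places a version string right after it.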
6861 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6862 user_data[i]= get_bits(&s->gb, 8);
6865 user_data[i]= 0;
6866 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6867 if(e==1 && build>=0)
6868 h->x264_build= build;
6870 if(s->avctx->debug & FF_DEBUG_BUGS)
6871 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6873 for(; i<size; i++)
6874 skip_bits(&s->gb, 8);
6876 return 0;
6879 static int decode_sei(H264Context *h){
6880 MpegEncContext * const s = &h->s;
6882 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6883 int size, type;
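6884 // SEI payload type and size are each coded as a run of 0xFF bytes plus a final byte, summed up.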
6885 type=0;
6887 type+= show_bits(&s->gb, 8);
6888 }while(get_bits(&s->gb, 8) == 255);
6890 size=0;
6892 size+= show_bits(&s->gb, 8);
6893 }while(get_bits(&s->gb, 8) == 255);
6895 switch(type){
6896 case 1: // Picture timing SEI
6897 if(decode_picture_timing(h) < 0)
6898 return -1;
6899 break;
6900 case 5:
6901 if(decode_unregistered_user_data(h, size) < 0)
6902 return -1;
6903 break;
6904 default:
6905 skip_bits(&s->gb, 8*size);
6908 //FIXME check bits here
6909 align_get_bits(&s->gb);
6912 return 0;
6915 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6916 MpegEncContext * const s = &h->s;
6917 int cpb_count, i;
6918 cpb_count = get_ue_golomb(&s->gb) + 1;
6919 get_bits(&s->gb, 4); /* bit_rate_scale */
6920 get_bits(&s->gb, 4); /* cpb_size_scale */
6921 for(i=0; i<cpb_count; i++){
6922 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6923 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6924 get_bits1(&s->gb); /* cbr_flag */
6926 get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6927 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6928 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6929 sps->time_offset_length = get_bits(&s->gb, 5);
6932 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6933 MpegEncContext * const s = &h->s;
6934 int aspect_ratio_info_present_flag;
6935 unsigned int aspect_ratio_idc;
6937 aspect_ratio_info_present_flag= get_bits1(&s->gb);
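6938 // aspect_ratio_idc 255 (EXTENDED_SAR) carries an explicit num/den pair; lower values index a fixed table.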
6939 if( aspect_ratio_info_present_flag ) {
6940 aspect_ratio_idc= get_bits(&s->gb, 8);
6941 if( aspect_ratio_idc == EXTENDED_SAR ) {
6942 sps->sar.num= get_bits(&s->gb, 16);
6943 sps->sar.den= get_bits(&s->gb, 16);
6944 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6945 sps->sar= pixel_aspect[aspect_ratio_idc];
6946 }else{
6947 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6948 return -1;
6950 }else{
6951 sps->sar.num=
6952 sps->sar.den= 0;
6954 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6956 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
6957 get_bits1(&s->gb); /* overscan_appropriate_flag */
6960 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
6961 get_bits(&s->gb, 3); /* video_format */
6962 get_bits1(&s->gb); /* video_full_range_flag */
6963 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
6964 get_bits(&s->gb, 8); /* colour_primaries */
6965 get_bits(&s->gb, 8); /* transfer_characteristics */
6966 get_bits(&s->gb, 8); /* matrix_coefficients */
6970 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
6971 get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
6972 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
6975 sps->timing_info_present_flag = get_bits1(&s->gb);
6976 if(sps->timing_info_present_flag){
6977 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6978 sps->time_scale = get_bits_long(&s->gb, 32);
6979 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6982 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6983 if(sps->nal_hrd_parameters_present_flag)
6984 decode_hrd_parameters(h, sps);
6985 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6986 if(sps->vcl_hrd_parameters_present_flag)
6987 decode_hrd_parameters(h, sps);
6988 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6989 get_bits1(&s->gb); /* low_delay_hrd_flag */
6990 sps->pic_struct_present_flag = get_bits1(&s->gb);
6992 sps->bitstream_restriction_flag = get_bits1(&s->gb);
6993 if(sps->bitstream_restriction_flag){
6994 unsigned int num_reorder_frames;
6995 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
6996 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6997 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6998 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6999 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7000 num_reorder_frames= get_ue_golomb(&s->gb);
7001 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7003 if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7004 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7005 return -1;
7008 sps->num_reorder_frames= num_reorder_frames;
7011 return 0;
7014 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7015 const uint8_t *jvt_list, const uint8_t *fallback_list){
7016 MpegEncContext * const s = &h->s;
7017 int i, last = 8, next = 8;
7018 const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7019 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7020 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7021 else
7022 for(i=0;i<size;i++){
7023 if(next)
7024 next = (last + get_se_golomb(&s->gb)) & 0xff;
7025 if(!i && !next){ /* matrix not written, we use the preset one */
7026 memcpy(factors, jvt_list, size*sizeof(uint8_t));
7027 break;
7029 last = factors[scan[i]] = next ? next : last;
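7030 // Each level is a signed delta from the previous one; once the running value hits zero,
7031 // the last value is repeated for the rest of the list (or, at index 0, the JVT default matrix is used).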
7033 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7034 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7035 MpegEncContext * const s = &h->s;
7036 int fallback_sps = !is_sps && sps->scaling_matrix_present;
7037 const uint8_t *fallback[4] = {
7038 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7039 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7040 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7041 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7043 if(get_bits1(&s->gb)){
7044 sps->scaling_matrix_present |= is_sps;
7045 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7046 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7047 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7048 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7049 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7050 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7051 if(is_sps || pps->transform_8x8_mode){
7052 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7053 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
7059 * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7061 static void *
7062 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7063 const size_t size, const char *name)
7065 if(id>=max) {
7066 av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7067 return NULL;
7070 if(!vec[id]) {
7071 vec[id] = av_mallocz(size);
7072 if(vec[id] == NULL)
7073 av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7075 return vec[id];
7078 static inline int decode_seq_parameter_set(H264Context *h){
7079 MpegEncContext * const s = &h->s;
7080 int profile_idc, level_idc;
7081 unsigned int sps_id, tmp, mb_width, mb_height;
7082 int i;
7083 SPS *sps;
7085 profile_idc= get_bits(&s->gb, 8);
7086 get_bits1(&s->gb); //constraint_set0_flag
7087 get_bits1(&s->gb); //constraint_set1_flag
7088 get_bits1(&s->gb); //constraint_set2_flag
7089 get_bits1(&s->gb); //constraint_set3_flag
7090 get_bits(&s->gb, 4); // reserved
7091 level_idc= get_bits(&s->gb, 8);
7092 sps_id= get_ue_golomb(&s->gb);
7094 sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7095 if(sps == NULL)
7096 return -1;
7098 sps->profile_idc= profile_idc;
7099 sps->level_idc= level_idc;
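7100 // Start from flat (all 16) scaling matrices; high-profile SPS data below may override them.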
7101 memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7102 memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7103 sps->scaling_matrix_present = 0;
7105 if(sps->profile_idc >= 100){ //high profile
7106 sps->chroma_format_idc= get_ue_golomb(&s->gb);
7107 if(sps->chroma_format_idc == 3)
7108 get_bits1(&s->gb); //residual_color_transform_flag
7109 get_ue_golomb(&s->gb); //bit_depth_luma_minus8
7110 get_ue_golomb(&s->gb); //bit_depth_chroma_minus8
7111 sps->transform_bypass = get_bits1(&s->gb);
7112 decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7113 }else{
7114 sps->chroma_format_idc= 1;
7117 sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7118 sps->poc_type= get_ue_golomb(&s->gb);
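7119 // pic_order_cnt_type: 0 = POC LSBs sent explicitly, 1 = derived from frame_num plus signalled offsets, 2 = follows decoding order.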
7120 if(sps->poc_type == 0){ //FIXME #define
7121 sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7122 } else if(sps->poc_type == 1){//FIXME #define
7123 sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7124 sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7125 sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7126 tmp= get_ue_golomb(&s->gb);
7128 if(tmp >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7129 av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7130 return -1;
7132 sps->poc_cycle_length= tmp;
7134 for(i=0; i<sps->poc_cycle_length; i++)
7135 sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7136 }else if(sps->poc_type != 2){
7137 av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7138 return -1;
7141 tmp= get_ue_golomb(&s->gb);
7142 if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7143 av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7144 return -1;
7146 sps->ref_frame_count= tmp;
7147 sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7148 mb_width= get_ue_golomb(&s->gb) + 1;
7149 mb_height= get_ue_golomb(&s->gb) + 1;
7150 if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7151 avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7152 av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7153 return -1;
7155 sps->mb_width = mb_width;
7156 sps->mb_height= mb_height;
7158 sps->frame_mbs_only_flag= get_bits1(&s->gb);
7159 if(!sps->frame_mbs_only_flag)
7160 sps->mb_aff= get_bits1(&s->gb);
7161 else
7162 sps->mb_aff= 0;
7164 sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7166 #ifndef ALLOW_INTERLACE
7167 if(sps->mb_aff)
7168 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7169 #endif
7170 sps->crop= get_bits1(&s->gb);
7171 if(sps->crop){
7172 sps->crop_left = get_ue_golomb(&s->gb);
7173 sps->crop_right = get_ue_golomb(&s->gb);
7174 sps->crop_top = get_ue_golomb(&s->gb);
7175 sps->crop_bottom= get_ue_golomb(&s->gb);
7176 if(sps->crop_left || sps->crop_top){
7177 av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7179 if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7180 av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7182 }else{
7183 sps->crop_left =
7184 sps->crop_right =
7185 sps->crop_top =
7186 sps->crop_bottom= 0;
7189 sps->vui_parameters_present_flag= get_bits1(&s->gb);
7190 if( sps->vui_parameters_present_flag )
7191 decode_vui_parameters(h, sps);
7193 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7194 av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7195 sps_id, sps->profile_idc, sps->level_idc,
7196 sps->poc_type,
7197 sps->ref_frame_count,
7198 sps->mb_width, sps->mb_height,
7199 sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7200 sps->direct_8x8_inference_flag ? "8B8" : "",
7201 sps->crop_left, sps->crop_right,
7202 sps->crop_top, sps->crop_bottom,
7203 sps->vui_parameters_present_flag ? "VUI" : "",
7204 ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7207 return 0;
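7208 /* Precompute the chroma QP for every luma QP (0..51), folding in the
7209  * per-PPS chroma_qp_index_offset passed as 'index'. */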
7210 static void
7211 build_qp_table(PPS *pps, int t, int index)
7213 int i;
7214 for(i = 0; i < 52; i++)
7215 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7218 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7219 MpegEncContext * const s = &h->s;
7220 unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7221 PPS *pps;
7223 pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7224 if(pps == NULL)
7225 return -1;
7227 tmp= get_ue_golomb(&s->gb);
7228 if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7229 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7230 return -1;
7232 pps->sps_id= tmp;
7234 pps->cabac= get_bits1(&s->gb);
7235 pps->pic_order_present= get_bits1(&s->gb);
7236 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7237 if(pps->slice_group_count > 1 ){
7238 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7239 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7240 switch(pps->mb_slice_group_map_type){
7241 case 0:
7242 #if 0
7243 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7244 | run_length[ i ] |1 |ue(v) |
7245 #endif
7246 break;
7247 case 2:
7248 #if 0
7249 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7250 |{ | | |
7251 | top_left_mb[ i ] |1 |ue(v) |
7252 | bottom_right_mb[ i ] |1 |ue(v) |
7253 | } | | |
7254 #endif
7255 break;
7256 case 3:
7257 case 4:
7258 case 5:
7259 #if 0
7260 | slice_group_change_direction_flag |1 |u(1) |
7261 | slice_group_change_rate_minus1 |1 |ue(v) |
7262 #endif
7263 break;
7264 case 6:
7265 #if 0
7266 | slice_group_id_cnt_minus1 |1 |ue(v) |
7267 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7268 |) | | |
7269 | slice_group_id[ i ] |1 |u(v) |
7270 #endif
7271 break;
7274 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7275 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7276 if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7277 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7278 pps->ref_count[0]= pps->ref_count[1]= 1;
7279 return -1;
7282 pps->weighted_pred= get_bits1(&s->gb);
7283 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7284 pps->init_qp= get_se_golomb(&s->gb) + 26;
7285 pps->init_qs= get_se_golomb(&s->gb) + 26;
7286 pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7287 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7288 pps->constrained_intra_pred= get_bits1(&s->gb);
7289 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7291 pps->transform_8x8_mode= 0;
7292 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7293 memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7294 memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7296 if(get_bits_count(&s->gb) < bit_length){
7297 pps->transform_8x8_mode= get_bits1(&s->gb);
7298 decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7299 pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7300 } else {
7301 pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7304 build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7305 build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7306 if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7307 h->pps.chroma_qp_diff= 1;
7309 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7310 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7311 pps_id, pps->sps_id,
7312 pps->cabac ? "CABAC" : "CAVLC",
7313 pps->slice_group_count,
7314 pps->ref_count[0], pps->ref_count[1],
7315 pps->weighted_pred ? "weighted" : "",
7316 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7317 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7318 pps->constrained_intra_pred ? "CONSTR" : "",
7319 pps->redundant_pic_cnt_present ? "REDU" : "",
7320 pps->transform_8x8_mode ? "8x8DCT" : ""
7324 return 0;
7328 * Call decode_slice() for each context.
7330 * @param h h264 master context
7331 * @param context_count number of contexts to execute
7333 static void execute_decode_slices(H264Context *h, int context_count){
7334 MpegEncContext * const s = &h->s;
7335 AVCodecContext * const avctx= s->avctx;
7336 H264Context *hx;
7337 int i;
7339 if(context_count == 1) {
7340 decode_slice(avctx, &h);
7341 } else {
7342 for(i = 1; i < context_count; i++) {
7343 hx = h->thread_context[i];
7344 hx->s.error_recognition = avctx->error_recognition;
7345 hx->s.error_count = 0;
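7346 // Let the codec's thread pool run decode_slice() once per slice context.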
7348 avctx->execute(avctx, (void *)decode_slice,
7349 (void **)h->thread_context, NULL, context_count, sizeof(void*));
7351 /* pull back stuff from slices to master context */
7352 hx = h->thread_context[context_count - 1];
7353 s->mb_x = hx->s.mb_x;
7354 s->mb_y = hx->s.mb_y;
7355 s->dropable = hx->s.dropable;
7356 s->picture_structure = hx->s.picture_structure;
7357 for(i = 1; i < context_count; i++)
7358 h->s.error_count += h->thread_context[i]->s.error_count;
7363 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7364 MpegEncContext * const s = &h->s;
7365 AVCodecContext * const avctx= s->avctx;
7366 int buf_index=0;
7367 H264Context *hx; ///< thread context
7368 int context_count = 0;
7370 h->max_contexts = avctx->thread_count;
7371 #if 0
7372 int i;
7373 for(i=0; i<50; i++){
7374 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7376 #endif
7377 if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7378 h->current_slice = 0;
7379 if (!s->first_field)
7380 s->current_picture_ptr= NULL;
7383 for(;;){
7384 int consumed;
7385 int dst_length;
7386 int bit_length;
7387 const uint8_t *ptr;
7388 int i, nalsize = 0;
7389 int err;
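7390 // AVC (length-prefixed) input carries an explicit NAL size; Annex B input is scanned for start codes below.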
7391 if(h->is_avc) {
7392 if(buf_index >= buf_size) break;
7393 nalsize = 0;
7394 for(i = 0; i < h->nal_length_size; i++)
7395 nalsize = (nalsize << 8) | buf[buf_index++];
7396 if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7397 if(nalsize == 1){
7398 buf_index++;
7399 continue;
7400 }else{
7401 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7402 break;
7405 } else {
7406 // start code prefix search
7407 for(; buf_index + 3 < buf_size; buf_index++){
7408 // This should always succeed in the first iteration.
7409 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7410 break;
7413 if(buf_index+3 >= buf_size) break;
7415 buf_index+=3;
7418 hx = h->thread_context[context_count];
7420 ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7421 if (ptr==NULL || dst_length < 0){
7422 return -1;
7424 while(dst_length > 0 && ptr[dst_length - 1] == 0)
7425 dst_length--;
7426 bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7428 if(s->avctx->debug&FF_DEBUG_STARTCODE){
7429 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7432 if (h->is_avc && (nalsize != consumed)){
7433 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7434 consumed= nalsize;
7437 buf_index += consumed;
7439 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
7440 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7441 continue;
7443 again:
7444 err = 0;
7445 switch(hx->nal_unit_type){
7446 case NAL_IDR_SLICE:
7447 if (h->nal_unit_type != NAL_IDR_SLICE) {
7448 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7449 return -1;
7451 idr(h); //FIXME ensure we don't lose some frames if there is reordering
7452 case NAL_SLICE:
7453 init_get_bits(&hx->s.gb, ptr, bit_length);
7454 hx->intra_gb_ptr=
7455 hx->inter_gb_ptr= &hx->s.gb;
7456 hx->s.data_partitioning = 0;
7458 if((err = decode_slice_header(hx, h)))
7459 break;
7461 s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7462 if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7463 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7464 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7465 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7466 && avctx->skip_frame < AVDISCARD_ALL)
7467 context_count++;
7468 break;
7469 case NAL_DPA:
7470 init_get_bits(&hx->s.gb, ptr, bit_length);
7471 hx->intra_gb_ptr=
7472 hx->inter_gb_ptr= NULL;
7473 hx->s.data_partitioning = 1;
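7474 // Data partitioning: DPA holds the slice header and motion data, DPB intra residuals, DPC inter residuals.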
7475 err = decode_slice_header(hx, h);
7476 break;
7477 case NAL_DPB:
7478 init_get_bits(&hx->intra_gb, ptr, bit_length);
7479 hx->intra_gb_ptr= &hx->intra_gb;
7480 break;
7481 case NAL_DPC:
7482 init_get_bits(&hx->inter_gb, ptr, bit_length);
7483 hx->inter_gb_ptr= &hx->inter_gb;
7485 if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7486 && s->context_initialized
7487 && s->hurry_up < 5
7488 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7489 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7490 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7491 && avctx->skip_frame < AVDISCARD_ALL)
7492 context_count++;
7493 break;
7494 case NAL_SEI:
7495 init_get_bits(&s->gb, ptr, bit_length);
7496 decode_sei(h);
7497 break;
7498 case NAL_SPS:
7499 init_get_bits(&s->gb, ptr, bit_length);
7500 decode_seq_parameter_set(h);
7502 if(s->flags& CODEC_FLAG_LOW_DELAY)
7503 s->low_delay=1;
7505 if(avctx->has_b_frames < 2)
7506 avctx->has_b_frames= !s->low_delay;
7507 break;
7508 case NAL_PPS:
7509 init_get_bits(&s->gb, ptr, bit_length);
7511 decode_picture_parameter_set(h, bit_length);
7513 break;
7514 case NAL_AUD:
7515 case NAL_END_SEQUENCE:
7516 case NAL_END_STREAM:
7517 case NAL_FILLER_DATA:
7518 case NAL_SPS_EXT:
7519 case NAL_AUXILIARY_SLICE:
7520 break;
7521 default:
7522 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7525 if(context_count == h->max_contexts) {
7526 execute_decode_slices(h, context_count);
7527 context_count = 0;
7530 if (err < 0)
7531 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7532 else if(err == 1) {
7533 /* Slice could not be decoded in parallel mode, copy down
7534 * NAL unit stuff to context 0 and restart. Note that
7535 * rbsp_buffer is not transferred, but since we no longer
7536 * run in parallel mode this should not be an issue. */
7537 h->nal_unit_type = hx->nal_unit_type;
7538 h->nal_ref_idc = hx->nal_ref_idc;
7539 hx = h;
7540 goto again;
7543 if(context_count)
7544 execute_decode_slices(h, context_count);
7545 return buf_index;
7549 * returns the number of bytes consumed for building the current frame
7551 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7552 if(pos==0) pos=1; //avoid infinite loops (I doubt that is needed, but ...)
7553 if(pos+10>buf_size) pos=buf_size; // oops ;)
7555 return pos;
7558 static int decode_frame(AVCodecContext *avctx,
7559 void *data, int *data_size,
7560 const uint8_t *buf, int buf_size)
7562 H264Context *h = avctx->priv_data;
7563 MpegEncContext *s = &h->s;
7564 AVFrame *pict = data;
7565 int buf_index;
7567 s->flags= avctx->flags;
7568 s->flags2= avctx->flags2;
7570 /* end of stream, output what is still in the buffers */
7571 if (buf_size == 0) {
7572 Picture *out;
7573 int i, out_idx;
7575 //FIXME factorize this with the output code below
7576 out = h->delayed_pic[0];
7577 out_idx = 0;
7578 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7579 if(h->delayed_pic[i]->poc < out->poc){
7580 out = h->delayed_pic[i];
7581 out_idx = i;
7584 for(i=out_idx; h->delayed_pic[i]; i++)
7585 h->delayed_pic[i] = h->delayed_pic[i+1];
7587 if(out){
7588 *data_size = sizeof(AVFrame);
7589 *pict= *(AVFrame*)out;
7592 return 0;
7595 if(h->is_avc && !h->got_avcC) {
7596 int i, cnt, nalsize;
7597 unsigned char *p = avctx->extradata;
7598 if(avctx->extradata_size < 7) {
7599 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7600 return -1;
7602 if(*p != 1) {
7603 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7604 return -1;
7606 /* The SPS and PPS in the avcC always have their length coded with 2 bytes,
7607 so put a fake nal_length_size = 2 while parsing them */
7608 h->nal_length_size = 2;
7609 // Decode sps from avcC
7610 cnt = *(p+5) & 0x1f; // Number of sps
7611 p += 6;
7612 for (i = 0; i < cnt; i++) {
7613 nalsize = AV_RB16(p) + 2;
7614 if(decode_nal_units(h, p, nalsize) < 0) {
7615 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7616 return -1;
7618 p += nalsize;
7620 // Decode pps from avcC
7621 cnt = *(p++); // Number of pps
7622 for (i = 0; i < cnt; i++) {
7623 nalsize = AV_RB16(p) + 2;
7624 if(decode_nal_units(h, p, nalsize) != nalsize) {
7625 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7626 return -1;
7628 p += nalsize;
7630 // Now store the right nal_length_size, which will be used to parse all other NALs
7631 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7632 // Do not reparse avcC
7633 h->got_avcC = 1;
7636 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7637 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7638 return -1;
7639 h->got_avcC = 1;
7642 buf_index=decode_nal_units(h, buf, buf_size);
7643 if(buf_index < 0)
7644 return -1;
7646 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7647 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7648 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7649 return -1;
7652 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7653 Picture *out = s->current_picture_ptr;
7654 Picture *cur = s->current_picture_ptr;
7655 int i, pics, cross_idr, out_of_order, out_idx;
7657 s->mb_y= 0;
7659 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7660 s->current_picture_ptr->pict_type= s->pict_type;
7662 if(!s->dropable) {
7663 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7664 h->prev_poc_msb= h->poc_msb;
7665 h->prev_poc_lsb= h->poc_lsb;
7667 h->prev_frame_num_offset= h->frame_num_offset;
7668 h->prev_frame_num= h->frame_num;
7671 * FIXME: The error handling code does not seem to support interlaced
7672 * content when slices span multiple rows.
7673 * The ff_er_add_slice calls don't work right for bottom
7674 * fields; they cause massive erroneous error concealment.
7675 * Error marking covers both fields (top and bottom).
7676 * This causes a mismatched s->error_count
7677 * and a bad error table. Further, the error count goes to
7678 * INT_MAX when called for the bottom field, because mb_y is
7679 * one past the end (the caller's fault) and resync_mb_y != 0
7680 * causes problems for the first MB line, too.
7682 if (!FIELD_PICTURE)
7683 ff_er_frame_end(s);
7685 MPV_frame_end(s);
7687 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7688 /* Wait for second field. */
7689 *data_size = 0;
7691 } else {
7692 cur->repeat_pict = 0;
7694 /* Signal interlacing information externally. */
7695 /* Prefer picture timing SEI information, when present, over the decoding process actually used. */
7696 if(h->sps.pic_struct_present_flag){
7697 switch (h->sei_pic_struct)
7699 case SEI_PIC_STRUCT_FRAME:
7700 cur->interlaced_frame = 0;
7701 break;
7702 case SEI_PIC_STRUCT_TOP_FIELD:
7703 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7704 case SEI_PIC_STRUCT_TOP_BOTTOM:
7705 case SEI_PIC_STRUCT_BOTTOM_TOP:
7706 cur->interlaced_frame = 1;
7707 break;
7708 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7709 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7710 // Signal the possibility of telecined film externally (pic_struct 5,6)
7711 // From these hints, let the application decide whether to apply deinterlacing.
7712 cur->repeat_pict = 1;
7713 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7714 break;
7715 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7717 // Force progressive here, as doubling an interlaced frame is a bad idea.
7717 cur->interlaced_frame = 0;
7718 cur->repeat_pict = 2;
7719 break;
7720 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7721 cur->interlaced_frame = 0;
7722 cur->repeat_pict = 4;
7723 break;
7725 }else{
7726 /* Derive the interlacing flag from the decoding process actually used. */
7727 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7730 if (cur->field_poc[0] != cur->field_poc[1]){
7731 /* Derive top_field_first from field pocs. */
7732 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7733 }else{
7734 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7735 /* Use picture timing SEI information. Even if it comes from a past frame, it is better than nothing. */
7736 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7737 || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7738 cur->top_field_first = 1;
7739 else
7740 cur->top_field_first = 0;
7741 }else{
7742 /* Most likely progressive */
7743 cur->top_field_first = 0;
7747 //FIXME do something with unavailable reference frames
7749 /* Sort B-frames into display order */
7751 if(h->sps.bitstream_restriction_flag
7752 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7753 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7754 s->low_delay = 0;
7757 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7758 && !h->sps.bitstream_restriction_flag){
7759 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7760 s->low_delay= 0;
7763 pics = 0;
7764 while(h->delayed_pic[pics]) pics++;
7766 assert(pics <= MAX_DELAYED_PIC_COUNT);
7768 h->delayed_pic[pics++] = cur;
7769 if(cur->reference == 0)
7770 cur->reference = DELAYED_PIC_REF;
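7771 // Pick the delayed picture with the smallest POC as the output candidate, stopping at an IDR/keyframe boundary.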
7772 out = h->delayed_pic[0];
7773 out_idx = 0;
7774 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7775 if(h->delayed_pic[i]->poc < out->poc){
7776 out = h->delayed_pic[i];
7777 out_idx = i;
7779 cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7781 out_of_order = !cross_idr && out->poc < h->outputed_poc;
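7782 // The candidate is out of order if its POC steps backwards from the last output and no IDR lies in between.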
7783 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7785 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7786 || (s->low_delay &&
7787 ((!cross_idr && out->poc > h->outputed_poc + 2)
7788 || cur->pict_type == FF_B_TYPE)))
7790 s->low_delay = 0;
7791 s->avctx->has_b_frames++;
7794 if(out_of_order || pics > s->avctx->has_b_frames){
7795 out->reference &= ~DELAYED_PIC_REF;
7796 for(i=out_idx; h->delayed_pic[i]; i++)
7797 h->delayed_pic[i] = h->delayed_pic[i+1];
7799 if(!out_of_order && pics > s->avctx->has_b_frames){
7800 *data_size = sizeof(AVFrame);
7802 h->outputed_poc = out->poc;
7803 *pict= *(AVFrame*)out;
7804 }else{
7805 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7810 assert(pict->data[0] || !*data_size);
7811 ff_print_debug_info(s, pict);
7812 //printf("out %d\n", (int)pict->data[0]);
7813 #if 0 //?
7815 /* Return the Picture timestamp as the frame number */
7816 /* we subtract 1 because it is added in utils.c */
7817 avctx->frame_number = s->picture_number - 1;
7818 #endif
7819 return get_consumed_bytes(s, buf_index, buf_size);
7821 #if 0
7822 static inline void fill_mb_avail(H264Context *h){
7823 MpegEncContext * const s = &h->s;
7824 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7826 if(s->mb_y){
7827 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7828 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7829 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7830 }else{
7831 h->mb_avail[0]=
7832 h->mb_avail[1]=
7833 h->mb_avail[2]= 0;
7835 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7836 h->mb_avail[4]= 1; //FIXME move out
7837 h->mb_avail[5]= 0; //FIXME move out
7839 #endif
7841 #ifdef TEST
7842 #undef printf
7843 #undef random
7844 #define COUNT 8000
7845 #define SIZE (COUNT*40)
7846 int main(void){
7847 int i;
7848 uint8_t temp[SIZE];
7849 PutBitContext pb;
7850 GetBitContext gb;
7851 // int int_temp[10000];
7852 DSPContext dsp;
7853 AVCodecContext avctx;
7855 dsputil_init(&dsp, &avctx);
7857 init_put_bits(&pb, temp, SIZE);
7858 printf("testing unsigned exp golomb\n");
7859 for(i=0; i<COUNT; i++){
7860 START_TIMER
7861 set_ue_golomb(&pb, i);
7862 STOP_TIMER("set_ue_golomb");
7864 flush_put_bits(&pb);
7866 init_get_bits(&gb, temp, 8*SIZE);
7867 for(i=0; i<COUNT; i++){
7868 int j, s;
7870 s= show_bits(&gb, 24);
7872 START_TIMER
7873 j= get_ue_golomb(&gb);
7874 if(j != i){
7875 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7876 // return -1;
7878 STOP_TIMER("get_ue_golomb");
7882 init_put_bits(&pb, temp, SIZE);
7883 printf("testing signed exp golomb\n");
7884 for(i=0; i<COUNT; i++){
7885 START_TIMER
7886 set_se_golomb(&pb, i - COUNT/2);
7887 STOP_TIMER("set_se_golomb");
7889 flush_put_bits(&pb);
7891 init_get_bits(&gb, temp, 8*SIZE);
7892 for(i=0; i<COUNT; i++){
7893 int j, s;
7895 s= show_bits(&gb, 24);
7897 START_TIMER
7898 j= get_se_golomb(&gb);
7899 if(j != i - COUNT/2){
7900 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7901 // return -1;
7903 STOP_TIMER("get_se_golomb");
7906 #if 0
7907 printf("testing 4x4 (I)DCT\n");
7909 DCTELEM block[16];
7910 uint8_t src[16], ref[16];
7911 uint64_t error= 0, max_error=0;
7913 for(i=0; i<COUNT; i++){
7914 int j;
7915 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7916 for(j=0; j<16; j++){
7917 ref[j]= random()%255;
7918 src[j]= random()%255;
7921 h264_diff_dct_c(block, src, ref, 4);
7923 //normalize
7924 for(j=0; j<16; j++){
7925 // printf("%d ", block[j]);
7926 block[j]= block[j]*4;
7927 if(j&1) block[j]= (block[j]*4 + 2)/5;
7928 if(j&4) block[j]= (block[j]*4 + 2)/5;
7930 // printf("\n");
7932 s->dsp.h264_idct_add(ref, block, 4);
7933 /* for(j=0; j<16; j++){
7934 printf("%d ", ref[j]);
7936 printf("\n");*/
7938 for(j=0; j<16; j++){
7939 int diff= FFABS(src[j] - ref[j]);
7941 error+= diff*diff;
7942 max_error= FFMAX(max_error, diff);
7945 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7946 printf("testing quantizer\n");
7947 for(qp=0; qp<52; qp++){
7948 for(i=0; i<16; i++)
7949 src1_block[i]= src2_block[i]= random()%255;
7952 printf("Testing NAL layer\n");
7954 uint8_t bitstream[COUNT];
7955 uint8_t nal[COUNT*2];
7956 H264Context h;
7957 memset(&h, 0, sizeof(H264Context));
7959 for(i=0; i<COUNT; i++){
7960 int zeros= i;
7961 int nal_length;
7962 int consumed;
7963 int out_length;
7964 uint8_t *out;
7965 int j;
7967 for(j=0; j<COUNT; j++){
7968 bitstream[j]= (random() % 255) + 1;
7971 for(j=0; j<zeros; j++){
7972 int pos= random() % COUNT;
7973 while(bitstream[pos] == 0){
7974 pos++;
7975 pos %= COUNT;
7977 bitstream[pos]=0;
7980 START_TIMER
7982 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7983 if(nal_length<0){
7984 printf("encoding failed\n");
7985 return -1;
7988 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7990 STOP_TIMER("NAL")
7992 if(out_length != COUNT){
7993 printf("incorrect length %d %d\n", out_length, COUNT);
7994 return -1;
7997 if(consumed != nal_length){
7998 printf("incorrect consumed length %d %d\n", nal_length, consumed);
7999 return -1;
8002 if(memcmp(bitstream, out, COUNT)){
8003 printf("mismatch\n");
8004 return -1;
8007 #endif
8009 printf("Testing RBSP\n");
8012 return 0;
8014 #endif /* TEST */
8017 static av_cold int decode_end(AVCodecContext *avctx)
8019 H264Context *h = avctx->priv_data;
8020 MpegEncContext *s = &h->s;
8021 int i;
8023 av_freep(&h->rbsp_buffer[0]);
8024 av_freep(&h->rbsp_buffer[1]);
8025 free_tables(h); //FIXME cleanup init stuff perhaps
8027 for(i = 0; i < MAX_SPS_COUNT; i++)
8028 av_freep(h->sps_buffers + i);
8030 for(i = 0; i < MAX_PPS_COUNT; i++)
8031 av_freep(h->pps_buffers + i);
8033 MPV_common_end(s);
8035 // memset(h, 0, sizeof(H264Context));
8037 return 0;
8041 AVCodec h264_decoder = {
8042 "h264",
8043 CODEC_TYPE_VIDEO,
8044 CODEC_ID_H264,
8045 sizeof(H264Context),
8046 decode_init,
8047 NULL,
8048 decode_end,
8049 decode_frame,
8050 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8051 .flush= flush_dpb,
8052 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8055 #include "svq3.c"