Mark formats requiring external libs with an 'E' in the format support tables.
[FFMpeg-mirror/ffmpeg-vdpau.git] / libavcodec / h264.c
blobecd4fc742ad0adc6534af521e3d9bfb2980e53d4
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 /**
23 * @file h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
28 #include "dsputil.h"
29 #include "avcodec.h"
30 #include "mpegvideo.h"
31 #include "h264.h"
32 #include "h264data.h"
33 #include "h264_parser.h"
34 #include "golomb.h"
35 #include "rectangle.h"
37 #include "cabac.h"
38 #ifdef ARCH_X86
39 #include "i386/h264_i386.h"
40 #endif
42 //#undef NDEBUG
43 #include <assert.h>
45 /**
46 * Value of Picture.reference when Picture is not a reference picture, but
47 * is held for delayed output.
49 #define DELAYED_PIC_REF 4
/* CAVLC coeff_token VLCs; the four tables correspond to the four
 * context ranges of the spec, with statically allocated backing storage
 * (sizes listed in coeff_token_vlc_tables_size). */
51 static VLC coeff_token_vlc[4];
52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
/* coeff_token VLC used for the chroma DC blocks. */
55 static VLC chroma_dc_coeff_token_vlc;
56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
/* total_zeros VLCs, one per possible total_coeff value (1..15). */
59 static VLC total_zeros_vlc[15];
60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
61 static const int total_zeros_vlc_tables_size = 512;
/* total_zeros VLCs for chroma DC (1..3 coefficients). */
63 static VLC chroma_dc_total_zeros_vlc[3];
64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
/* run_before VLCs for small zeros_left values; run7_vlc covers the
 * remaining (larger) case. */
67 static VLC run_vlc[6];
68 static VLC_TYPE run_vlc_tables[6][8][2];
69 static const int run_vlc_tables_size = 8;
71 static VLC run7_vlc;
72 static VLC_TYPE run7_vlc_table[96][2];
73 static const int run7_vlc_table_size = 96;
/* forward declarations for functions defined later in this file */
75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
81 static av_always_inline uint32_t pack16to32(int a, int b){
82 #ifdef WORDS_BIGENDIAN
83 return (b&0xFFFF) + (a<<16);
84 #else
85 return (a&0xFFFF) + (b<<16);
86 #endif
/* qp%6 lookup for all legal luma qp values (0..51), used by dequant. */
89 static const uint8_t rem6[52]={
90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
/* qp/6 lookup for all legal luma qp values (0..51), used by dequant. */
93 static const uint8_t div6[52]={
94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
/* Per-row left-neighbour block index scans used by fill_caches(); the
 * non-zero variants remap the left blocks for the MBAFF cases where the
 * left neighbour pair is coded in the other field/frame mode. */
97 static const int left_block_options[4][8]={
98 {0,1,2,3,7,10,8,11},
99 {2,2,3,3,8,11,8,11},
100 {0,0,1,1,7,10,7,10},
101 {0,2,0,2,7,10,7,10}
/**
 * Fills the per-macroblock neighbour caches (intra prediction modes,
 * non-zero coefficient counts, motion vectors, reference indices, mvds,
 * direct flags and cbp) from the neighbouring macroblocks, handling
 * MBAFF remapping and slice boundaries.
 * @param mb_type     type of the current macroblock
 * @param for_deblock nonzero when only the subset of data needed by the
 *                    loop filter has to be filled
 */
104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
105 MpegEncContext * const s = &h->s;
106 const int mb_xy= h->mb_xy;
107 int topleft_xy, top_xy, topright_xy, left_xy[2];
108 int topleft_type, top_type, topright_type, left_type[2];
109 int * left_block;
110 int topleft_partition= -1;
111 int i;
113 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
115 //FIXME deblocking could skip the intra and nnz parts.
116 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
117 return;
119 /* Wow, what a mess, why didn't they simplify the interlacing & intra
120 * stuff, I can't imagine that these complex rules are worth it. */
122 topleft_xy = top_xy - 1;
123 topright_xy= top_xy + 1;
124 left_xy[1] = left_xy[0] = mb_xy-1;
125 left_block = left_block_options[0];
/* MBAFF: a neighbour of a field/frame MB may be the other macroblock of
 * a vertical pair, so the neighbour indices and the left-block scan have
 * to be remapped depending on the field/frame flags of each pair. */
126 if(FRAME_MBAFF){
127 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
128 const int top_pair_xy = pair_xy - s->mb_stride;
129 const int topleft_pair_xy = top_pair_xy - 1;
130 const int topright_pair_xy = top_pair_xy + 1;
131 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
132 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
133 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
134 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
135 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
136 const int bottom = (s->mb_y & 1);
137 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
138 if (bottom
139 ? !curr_mb_frame_flag // bottom macroblock
140 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
142 top_xy -= s->mb_stride;
144 if (bottom
145 ? !curr_mb_frame_flag // bottom macroblock
146 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
148 topleft_xy -= s->mb_stride;
149 } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
150 topleft_xy += s->mb_stride;
151 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
152 topleft_partition = 0;
154 if (bottom
155 ? !curr_mb_frame_flag // bottom macroblock
156 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
158 topright_xy -= s->mb_stride;
160 if (left_mb_frame_flag != curr_mb_frame_flag) {
161 left_xy[1] = left_xy[0] = pair_xy - 1;
162 if (curr_mb_frame_flag) {
163 if (bottom) {
164 left_block = left_block_options[1];
165 } else {
166 left_block= left_block_options[2];
168 } else {
169 left_xy[1] += s->mb_stride;
170 left_block = left_block_options[3];
175 h->top_mb_xy = top_xy;
176 h->left_mb_xy[0] = left_xy[0];
177 h->left_mb_xy[1] = left_xy[1];
/* Deblocking path: neighbour types only matter when the neighbour lies
 * inside the picture (slice_table < 255); for MBAFF the ref values that
 * were rescaled for MC are restored below. */
178 if(for_deblock){
179 topleft_type = 0;
180 topright_type = 0;
181 top_type = h->slice_table[top_xy ] < 255 ? s->current_picture.mb_type[top_xy] : 0;
182 left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
183 left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
185 if(MB_MBAFF && !IS_INTRA(mb_type)){
186 int list;
187 for(list=0; list<h->list_count; list++){
188 //These values where changed for ease of performing MC, we need to change them back
189 //FIXME maybe we can make MC and loop filter use the same values or prevent
190 //the MC code from changing ref_cache and rather use a temporary array.
191 if(USES_LIST(mb_type,list)){
192 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
193 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
194 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
195 ref += h->b8_stride;
196 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
197 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
201 }else{
202 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
203 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
204 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
205 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
206 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
/* Intra prediction: derive which neighbouring sample edges are
 * available, honouring constrained_intra_pred via type_mask. */
208 if(IS_INTRA(mb_type)){
209 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
210 h->topleft_samples_available=
211 h->top_samples_available=
212 h->left_samples_available= 0xFFFF;
213 h->topright_samples_available= 0xEEEA;
215 if(!(top_type & type_mask)){
216 h->topleft_samples_available= 0xB3FF;
217 h->top_samples_available= 0x33FF;
218 h->topright_samples_available= 0x26EA;
220 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
221 if(IS_INTERLACED(mb_type)){
222 if(!(left_type[0] & type_mask)){
223 h->topleft_samples_available&= 0xDFFF;
224 h->left_samples_available&= 0x5FFF;
226 if(!(left_type[1] & type_mask)){
227 h->topleft_samples_available&= 0xFF5F;
228 h->left_samples_available&= 0xFF5F;
230 }else{
231 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
232 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
233 assert(left_xy[0] == left_xy[1]);
234 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
235 h->topleft_samples_available&= 0xDF5F;
236 h->left_samples_available&= 0x5F5F;
239 }else{
240 if(!(left_type[0] & type_mask)){
241 h->topleft_samples_available&= 0xDF5F;
242 h->left_samples_available&= 0x5F5F;
246 if(!(topleft_type & type_mask))
247 h->topleft_samples_available&= 0x7FFF;
249 if(!(topright_type & type_mask))
250 h->topright_samples_available&= 0xFBFF;
/* Cache the intra4x4 prediction modes of the top/left neighbour rows;
 * -1 marks an unavailable neighbour, 2 (DC) is substituted when the
 * neighbour exists but is not intra4x4-coded. */
252 if(IS_INTRA4x4(mb_type)){
253 if(IS_INTRA4x4(top_type)){
254 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
255 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
256 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
257 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
258 }else{
259 int pred;
260 if(!(top_type & type_mask))
261 pred= -1;
262 else{
263 pred= 2;
265 h->intra4x4_pred_mode_cache[4+8*0]=
266 h->intra4x4_pred_mode_cache[5+8*0]=
267 h->intra4x4_pred_mode_cache[6+8*0]=
268 h->intra4x4_pred_mode_cache[7+8*0]= pred;
270 for(i=0; i<2; i++){
271 if(IS_INTRA4x4(left_type[i])){
272 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
273 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
274 }else{
275 int pred;
276 if(!(left_type[i] & type_mask))
277 pred= -1;
278 else{
279 pred= 2;
281 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
282 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
291 0 . T T. T T T T
292 1 L . .L . . . .
293 2 L . .L . . . .
294 3 . T TL . . . .
295 4 L . .L . . . .
296 5 L . .. . . . .
298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
/* non_zero_count cache: copy the neighbours' nnz values; a missing
 * neighbour is marked 64, except for CABAC inter MBs where it counts
 * as 0. */
299 if(top_type){
300 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
301 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
302 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
303 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
305 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
306 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
308 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
309 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
311 }else{
312 h->non_zero_count_cache[4+8*0]=
313 h->non_zero_count_cache[5+8*0]=
314 h->non_zero_count_cache[6+8*0]=
315 h->non_zero_count_cache[7+8*0]=
317 h->non_zero_count_cache[1+8*0]=
318 h->non_zero_count_cache[2+8*0]=
320 h->non_zero_count_cache[1+8*3]=
321 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
325 for (i=0; i<2; i++) {
326 if(left_type[i]){
327 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
328 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
329 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
330 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
331 }else{
332 h->non_zero_count_cache[3+8*1 + 2*8*i]=
333 h->non_zero_count_cache[3+8*2 + 2*8*i]=
334 h->non_zero_count_cache[0+8*1 + 8*i]=
335 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
/* CABAC context modelling needs the neighbours' coded block patterns. */
339 if( h->pps.cabac ) {
340 // top_cbp
341 if(top_type) {
342 h->top_cbp = h->cbp_table[top_xy];
343 } else if(IS_INTRA(mb_type)) {
344 h->top_cbp = 0x1C0;
345 } else {
346 h->top_cbp = 0;
348 // left_cbp
349 if (left_type[0]) {
350 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
351 } else if(IS_INTRA(mb_type)) {
352 h->left_cbp = 0x1C0;
353 } else {
354 h->left_cbp = 0;
356 if (left_type[0]) {
357 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
359 if (left_type[1]) {
360 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
364 #if 1
/* Inter prediction: fill the motion vector and reference index caches
 * from all available neighbours, per list. */
365 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
366 int list;
367 for(list=0; list<h->list_count; list++){
368 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
369 /*if(!h->mv_cache_clean[list]){
370 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
371 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
372 h->mv_cache_clean[list]= 1;
374 continue;
376 h->mv_cache_clean[list]= 0;
378 if(USES_LIST(top_type, list)){
379 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
380 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
381 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
382 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
383 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
384 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
385 h->ref_cache[list][scan8[0] + 0 - 1*8]=
386 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
387 h->ref_cache[list][scan8[0] + 2 - 1*8]=
388 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
389 }else{
390 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
391 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
392 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
393 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
394 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
397 for(i=0; i<2; i++){
398 int cache_idx = scan8[0] - 1 + i*2*8;
399 if(USES_LIST(left_type[i], list)){
400 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
401 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
402 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
403 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
404 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
405 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
406 }else{
407 *(uint32_t*)h->mv_cache [list][cache_idx ]=
408 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
409 h->ref_cache[list][cache_idx ]=
410 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
414 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
415 continue;
417 if(USES_LIST(topleft_type, list)){
418 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
419 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
420 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
421 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
422 }else{
423 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
424 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
427 if(USES_LIST(topright_type, list)){
428 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
429 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
430 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
431 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
432 }else{
433 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
434 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
437 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
438 continue;
440 h->ref_cache[list][scan8[5 ]+1] =
441 h->ref_cache[list][scan8[7 ]+1] =
442 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
443 h->ref_cache[list][scan8[4 ]] =
444 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
445 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
446 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
447 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
448 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
449 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
/* CABAC also needs the neighbours' motion vector differences. */
451 if( h->pps.cabac ) {
452 /* XXX beurk, Load mvd */
453 if(USES_LIST(top_type, list)){
454 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
455 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
456 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
457 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
458 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
459 }else{
460 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
461 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
462 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
463 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
465 if(USES_LIST(left_type[0], list)){
466 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
467 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
468 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
469 }else{
470 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
471 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
473 if(USES_LIST(left_type[1], list)){
474 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
475 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
476 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
477 }else{
478 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
479 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
481 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
482 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
483 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
484 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
485 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
/* B slices: cache the neighbours' direct-mode flags. */
487 if(h->slice_type_nos == FF_B_TYPE){
488 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
490 if(IS_DIRECT(top_type)){
491 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
492 }else if(IS_8X8(top_type)){
493 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
494 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
495 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
496 }else{
497 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
500 if(IS_DIRECT(left_type[0]))
501 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
502 else if(IS_8X8(left_type[0]))
503 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
504 else
505 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
507 if(IS_DIRECT(left_type[1]))
508 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
509 else if(IS_8X8(left_type[1]))
510 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
511 else
512 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
/* MBAFF: rescale cached refs/MVs where a neighbour's field/frame coding
 * differs from the current macroblock's (field <-> frame MV mapping). */
516 if(FRAME_MBAFF){
517 #define MAP_MVS\
518 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
519 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
520 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
521 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
522 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
523 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
524 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
525 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
526 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
527 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
528 if(MB_FIELD){
529 #define MAP_F2F(idx, mb_type)\
530 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
531 h->ref_cache[list][idx] <<= 1;\
532 h->mv_cache[list][idx][1] /= 2;\
533 h->mvd_cache[list][idx][1] /= 2;\
535 MAP_MVS
536 #undef MAP_F2F
537 }else{
538 #define MAP_F2F(idx, mb_type)\
539 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
540 h->ref_cache[list][idx] >>= 1;\
541 h->mv_cache[list][idx][1] <<= 1;\
542 h->mvd_cache[list][idx][1] <<= 1;\
544 MAP_MVS
545 #undef MAP_F2F
550 #endif
552 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
555 static inline void write_back_intra_pred_mode(H264Context *h){
556 const int mb_xy= h->mb_xy;
558 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
559 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
560 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
561 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
562 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
563 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
564 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
568 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
570 static inline int check_intra4x4_pred_mode(H264Context *h){
571 MpegEncContext * const s = &h->s;
572 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
573 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
574 int i;
576 if(!(h->top_samples_available&0x8000)){
577 for(i=0; i<4; i++){
578 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
579 if(status<0){
580 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
581 return -1;
582 } else if(status){
583 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
588 if((h->left_samples_available&0x8888)!=0x8888){
589 static const int mask[4]={0x8000,0x2000,0x80,0x20};
590 for(i=0; i<4; i++){
591 if(!(h->left_samples_available&mask[i])){
592 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
593 if(status<0){
594 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
595 return -1;
596 } else if(status){
597 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
603 return 0;
604 } //FIXME cleanup like next
607 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
609 static inline int check_intra_pred_mode(H264Context *h, int mode){
610 MpegEncContext * const s = &h->s;
611 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
612 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
614 if(mode > 6U) {
615 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
616 return -1;
619 if(!(h->top_samples_available&0x8000)){
620 mode= top[ mode ];
621 if(mode<0){
622 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
623 return -1;
627 if((h->left_samples_available&0x8080) != 0x8080){
628 mode= left[ mode ];
629 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
630 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
632 if(mode<0){
633 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
634 return -1;
638 return mode;
642 * gets the predicted intra4x4 prediction mode.
644 static inline int pred_intra_mode(H264Context *h, int n){
645 const int index8= scan8[n];
646 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
647 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
648 const int min= FFMIN(left, top);
650 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
652 if(min<0) return DC_PRED;
653 else return min;
656 static inline void write_back_non_zero_count(H264Context *h){
657 const int mb_xy= h->mb_xy;
659 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
660 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
661 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
662 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
663 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
664 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
665 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
667 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
668 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
669 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
671 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
672 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
673 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
677 * gets the predicted number of non-zero coefficients.
678 * @param n block index
680 static inline int pred_non_zero_count(H264Context *h, int n){
681 const int index8= scan8[n];
682 const int left= h->non_zero_count_cache[index8 - 1];
683 const int top = h->non_zero_count_cache[index8 - 8];
684 int i= left + top;
686 if(i<64) i= (i+1)>>1;
688 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
690 return i&31;
/**
 * Determines the block used for diagonal MV prediction: normally the
 * top-right neighbour, falling back to the top-left one when it is
 * unavailable. Points *C at the chosen motion vector and returns its
 * reference index. In MBAFF the MV may come from the other field and is
 * rescaled (via SET_DIAG_MV) before use.
 * @param i cache index of the current block (scan8-based)
 * @param part_width partition width in 4x4 units
 */
693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
694 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
695 MpegEncContext *s = &h->s;
697 /* there is no consistent mapping of mvs to neighboring locations that will
698 * make mbaff happy, so we can't move all this logic to fill_caches */
699 if(FRAME_MBAFF){
700 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
701 const int16_t *mv;
702 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
703 *C = h->mv_cache[list][scan8[0]-2];
705 if(!MB_FIELD
706 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
707 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
708 if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV fetches an MV from another field pair, scales the vertical
 * component/reference between field and frame units, and returns. */
709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
710 const int x4 = X4, y4 = Y4;\
711 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
712 if(!USES_LIST(mb_type,list))\
713 return LIST_NOT_USED;\
714 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
715 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
716 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
717 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
719 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
/* top-right neighbour unavailable in a left-edge MBAFF case:
 * try the left neighbour's other-field MV instead */
722 if(topright_ref == PART_NOT_AVAILABLE
723 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
724 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
725 if(!MB_FIELD
726 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
727 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
729 if(MB_FIELD
730 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
731 && i >= scan8[0]+8){
732 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
733 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
736 #undef SET_DIAG_MV
/* non-MBAFF (or no special case hit): top-right if available,
 * otherwise fall back to the top-left neighbour */
739 if(topright_ref != PART_NOT_AVAILABLE){
740 *C= h->mv_cache[list][ i - 8 + part_width ];
741 return topright_ref;
742 }else{
743 tprintf(s->avctx, "topright MV not available\n");
745 *C= h->mv_cache[list][ i - 8 - 1 ];
746 return h->ref_cache[list][ i - 8 - 1 ];
751 * gets the predicted MV.
752 * @param n the block index
753 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
754 * @param mx the x component of the predicted motion vector
755 * @param my the y component of the predicted motion vector
757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
758 const int index8= scan8[n];
759 const int top_ref= h->ref_cache[list][ index8 - 8 ];
760 const int left_ref= h->ref_cache[list][ index8 - 1 ];
761 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
762 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
763 const int16_t * C;
764 int diagonal_ref, match_count;
766 assert(part_width==1 || part_width==2 || part_width==4);
768 /* mv_cache
769 B . . A T T T T
770 U . . L . . , .
771 U . . L . . . .
772 U . . L . . , .
773 . . . L . . . .
776 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
777 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
778 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
779 if(match_count > 1){ //most common
780 *mx= mid_pred(A[0], B[0], C[0]);
781 *my= mid_pred(A[1], B[1], C[1]);
782 }else if(match_count==1){
783 if(left_ref==ref){
784 *mx= A[0];
785 *my= A[1];
786 }else if(top_ref==ref){
787 *mx= B[0];
788 *my= B[1];
789 }else{
790 *mx= C[0];
791 *my= C[1];
793 }else{
794 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
795 *mx= A[0];
796 *my= A[1];
797 }else{
798 *mx= mid_pred(A[0], B[0], C[0]);
799 *my= mid_pred(A[1], B[1], C[1]);
803 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
807 * gets the directionally predicted 16x8 MV.
808 * @param n the block index
809 * @param mx the x component of the predicted motion vector
810 * @param my the y component of the predicted motion vector
812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
813 if(n==0){
814 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
815 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
817 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
819 if(top_ref == ref){
820 *mx= B[0];
821 *my= B[1];
822 return;
824 }else{
825 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
826 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
828 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
830 if(left_ref == ref){
831 *mx= A[0];
832 *my= A[1];
833 return;
837 //RARE
838 pred_motion(h, n, 4, list, ref, mx, my);
/**
 * gets the directionally predicted 8x16 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        /* left 8x16 partition: try the A (left) neighbour first */
        const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
        const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        /* use the neighbour MV directly only when it references the same picture */
        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }else{
        /* right 8x16 partition: try the C (diagonal) neighbour first */
        const int16_t * C;
        int diagonal_ref;

        diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);

        if(diagonal_ref == ref){
            *mx= C[0];
            *my= C[1];
            return;
        }
    }

    //RARE: directional neighbour unusable, fall back to median prediction
    pred_motion(h, n, 2, list, ref, mx, my);
}
/**
 * Predicts the motion vector for a P-skip macroblock (list 0, ref 0).
 * The MV is forced to (0,0) when either the top or left neighbour is
 * unavailable, or when one of them is a zero MV with ref index 0;
 * otherwise the regular median prediction is used.
 */
static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];

    tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    /* the uint32_t load compares both MV components against zero at once */
    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
       || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
       || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){

        *mx = *my = 0;
        return;
    }

    pred_motion(h, 0, 4, 0, 0, mx, my);

    return;
}
897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
898 int poc0 = h->ref_list[0][i].poc;
899 int td = av_clip(poc1 - poc0, -128, 127);
900 if(td == 0 || h->ref_list[0][i].long_ref){
901 return 256;
902 }else{
903 int tb = av_clip(poc - poc0, -128, 127);
904 int tx = (16384 + (FFABS(td) >> 1)) / td;
905 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
/**
 * Fills the temporal-direct distance scale factor tables for the current slice:
 * h->dist_scale_factor_field[] (per-field entries) and h->dist_scale_factor[]
 * (frame-level entries).
 */
static inline void direct_dist_scale_factor(H264Context * const h){
    MpegEncContext * const s = &h->s;
    const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
    const int poc1 = h->ref_list[1][0].poc;
    int i, field;
    /* per-field table: the inner poc/poc1 intentionally shadow the frame-level
     * values declared above; field refs live at offset 16 in ref_list[0] */
    for(field=0; field<2; field++){
        const int poc  = h->s.current_picture_ptr->field_poc[field];
        const int poc1 = h->ref_list[1][0].field_poc[field];
        for(i=0; i < 2*h->ref_count[0]; i++)
            h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
    }

    for(i=0; i<h->ref_count[0]; i++){
        h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
    }
}
/**
 * Builds the mapping from the co-located (list1) picture's reference indexes
 * to the current slice's list0 indexes, used by temporal direct prediction.
 * References are matched by a packed poc key: 4*frame_num + (reference&3).
 * @param map      output table; entries 0..15 are frame-level, 16+ per-field
 * @param colfield which field of the co-located picture to read refs from
 * @param mbafi    nonzero when building the MBAFF per-field table
 */
static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    int j, old_ref, rfield;
    int start= mbafi ? 16                      : 0;
    int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
    int interl= mbafi || s->picture_structure != PICT_FRAME;

    /* bogus; fills in for missing frames */
    memset(map[list], 0, sizeof(map[list]));

    for(rfield=0; rfield<2; rfield++){
        for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
            int poc = ref1->ref_poc[colfield][list][old_ref];

            if     (!interl)
                poc |= 3; /* frame refs carry both field bits set */
            else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
                poc= (poc&~3) + rfield + 1;

            /* linear search for the current-list entry with the same poc key */
            for(j=start; j<end; j++){
                if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
                    int cur_ref= mbafi ? (j-16)^field : j;
                    map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
                    if(rfield == field)
                        map[list][old_ref] = cur_ref;
                    break;
                }
            }
        }
    }
}
/**
 * Records the current reference lists in the current picture (packed poc keys
 * and counts, per field/frame side) and, for temporal-direct B slices, builds
 * the col-to-list0 mapping tables via fill_colmap().
 */
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, j, field;
    int sidx= (s->picture_structure&1)^1;   /* side index for the current structure */
    int ref1sidx= (ref1->reference&1)^1;    /* side index of the co-located picture */

    for(list=0; list<2; list++){
        cur->ref_count[sidx][list] = h->ref_count[list];
        for(j=0; j<h->ref_count[list]; j++)
            cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
    }

    /* a frame picture covers both field sides: duplicate into the other slot */
    if(s->picture_structure == PICT_FRAME){
        memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
        memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
    }

    cur->mbaff= FRAME_MBAFF;

    /* the colmaps are only needed for temporal direct in B slices */
    if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
        return;

    for(list=0; list<2; list++){
        fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
        for(field=0; field<2; field++)
            fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
    }
}
/**
 * Derives motion vectors and reference indexes for a B-direct macroblock
 * (spatial or temporal direct mode per h->direct_spatial_mv_pred), filling
 * h->mv_cache / h->ref_cache / h->sub_mb_type and updating *mb_type.
 * The co-located macroblock is read from h->ref_list[1][0]; the various
 * interlace combinations (frame/field/MBAFF, abbreviated AFR/AFL/FR/FL in
 * the comments below) change where and with what stride it is sampled.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    int b8_stride = h->b8_stride;
    int b4_stride = h->b_stride;
    int mb_xy = h->mb_xy;
    int mb_type_col[2];
    const int16_t (*l1mv0)[2], (*l1mv1)[2];
    const int8_t *l1ref0, *l1ref1;
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)

    /* ---- locate the co-located macroblock and choose sub_mb_type ---- */
    if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
        if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
            /* pick the col field whose poc is closer to the current poc */
            int cur_poc = s->current_picture_ptr->poc;
            int *col_poc = h->ref_list[1]->field_poc;
            int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
            mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
            b8_stride = 0;
        }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
            int fieldoff= 2*(h->ref_list[1][0].reference)-3;
            mb_xy += s->mb_stride*fieldoff;
        }
        goto single_col;
    }else{                                               // AFL/AFR/FR/FL -> AFR/FR
        if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
            /* one field MB maps onto a vertical pair of frame MBs */
            mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
            mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
            b8_stride *= 3;
            b4_stride *= 6;
            //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
            if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
                && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
                && !is_b8x8){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }
        }else{                                           //     AFR/FR    -> AFR/FR
single_col:
            mb_type_col[0] =
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
            if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
                /* FIXME save sub mb types from previous frames (or derive from MVs)
                 * so we know exactly what block size to use */
                sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }
        }
    }

    /* pointers into the co-located picture's MV and ref-index planes */
    l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
    l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
    if(!b8_stride){ /* frame MB reading from a field col picture */
        if(s->mb_y&1){
            l1ref0 += h->b8_stride;
            l1ref1 += h->b8_stride;
            l1mv0  +=  2*b4_stride;
            l1mv1  +=  2*b4_stride;
        }
    }

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == PART_NOT_AVAILABLE)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            /* unsigned compare makes negative (unavailable) refs largest */
            ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the unused list from the (sub) mb type */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;
        }

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* field/frame mismatch: sample one col MV per 8x8 block */
            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* col_zero_flag: near-zero col MV with ref 0 zeroes the MV per list */
                if(!IS_INTRA(mb_type_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                }
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
            }
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* x264_build check works around a pre-build-34 x264 bug */
            if(!IS_INTRA(mb_type_col[0])
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
                                                 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
                                                     && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;
        int ref_offset= 0;

        if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
            /* use the per-field tables for field MBs in an MBAFF frame */
            map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
            map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
            dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
        }
        if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
            ref_offset += 16; /* field refs of the col picture start at index 16 */

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* FIXME assumes direct_8x8_inference == 1 */
            int y_shift  = 2*!IS_INTERLACED(*mb_type);

            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[y8])){
                    /* intra col block: zero MVs and ref 0 */
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0 + ref_offset];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                {
                    const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
                    int my_col = (mv_col[1]<<y_shift)/2; /* rescale vertical MV across field/frame */
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * my_col + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                }
            }
            return;
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col[0])){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
                                                : map_col_to_list0[1][l1ref1[0] + ref_offset];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                /* list1 MV is the scaled MV minus the col MV (H.264 8.4.1.2.3) */
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[0])){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
/**
 * Copies the per-macroblock MV/ref caches back into the picture-wide tables
 * (motion_val, ref_index, and for CABAC also mvd_table/direct_table).
 */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;   /* 4x4-block coords */
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;  /* 8x8-block coords */
    int list;

    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* two uint64_t stores copy a whole row of four 4x4-block MVs */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            /* MV differences are needed as CABAC context for later MBs */
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
                for(y=0; y<4; y++){
                    *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                    *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
                }
        }

        {
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
        }
    }

    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}
1370 * Decodes a network abstraction layer unit.
1371 * @param consumed is the number of bytes used as input
1372 * @param length is the length of the array
1373 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374 * @returns decoded bytes, might be src+1 if no escapes
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377 int i, si, di;
1378 uint8_t *dst;
1379 int bufidx;
1381 // src[0]&0x80; //forbidden bit
1382 h->nal_ref_idc= src[0]>>5;
1383 h->nal_unit_type= src[0]&0x1F;
1385 src++; length--;
1386 #if 0
1387 for(i=0; i<length; i++)
1388 printf("%2X ", src[i]);
1389 #endif
1390 for(i=0; i+1<length; i+=2){
1391 if(src[i]) continue;
1392 if(i>0 && src[i-1]==0) i--;
1393 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394 if(src[i+2]!=3){
1395 /* startcode, so we must be past the end */
1396 length=i;
1398 break;
1402 if(i>=length-1){ //no escaped 0
1403 *dst_length= length;
1404 *consumed= length+1; //+1 for the header
1405 return src;
1408 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409 h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410 dst= h->rbsp_buffer[bufidx];
1412 if (dst == NULL){
1413 return NULL;
1416 //printf("decoding esc\n");
1417 si=di=0;
1418 while(si<length){
1419 //remove escapes (very rare 1:2^22)
1420 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421 if(src[si+2]==3){ //escape
1422 dst[di++]= 0;
1423 dst[di++]= 0;
1424 si+=3;
1425 continue;
1426 }else //next start code
1427 break;
1430 dst[di++]= src[si++];
1433 *dst_length= di;
1434 *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436 return dst;
1440 * identifies the exact end of the bitstream
1441 * @return the length of the trailing, or 0 if damaged
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444 int v= *src;
1445 int r;
1447 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1449 for(r=1; r<9; r++){
1450 if(v&1) return r;
1451 v>>=1;
1453 return 0;
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * The DC coefficients sit at the (0,0) position of each 4x4 sub-block
 * inside the 16x16 luma coefficient array, hence the offset tables.
 * @param qp quantization parameter
 */
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
#define stride 16
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    //memset(block, 64, 2*256);
    //return;
    /* first pass: vertical 4-point Hadamard-style butterflies */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: horizontal butterflies, then dequantize with rounding */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    }
}
#if 0
/* NOTE(review): disabled forward-DCT counterpart of the dequant/IDCT above;
 * presumably encoder-side code — kept for reference only. */
/**
 * DCT transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1536 #undef xStride
1537 #undef stride
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540 const int stride= 16*2;
1541 const int xStride= 16;
1542 int a,b,c,d,e;
1544 a= block[stride*0 + xStride*0];
1545 b= block[stride*0 + xStride*1];
1546 c= block[stride*1 + xStride*0];
1547 d= block[stride*1 + xStride*1];
1549 e= a-b;
1550 a= a+b;
1551 b= c-d;
1552 c= c+d;
1554 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
#if 0
/* NOTE(review): disabled forward 2x2 Hadamard of the chroma DCs; presumably
 * encoder-side counterpart of chroma_dc_dequant_idct_c — kept for reference. */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
/**
 * gets the chroma qp.
 * Simple lookup in the per-PPS chroma QP table; t selects which of the two
 * chroma QP index offsets (Cb/Cr) to use.
 */
static inline int get_chroma_qp(H264Context *h, int t, int qscale){
    return h->pps.chroma_qp_table[t][qscale];
}
//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes a 4x4 block of coefficients in scan order.
 * The single unsigned comparison (level+threshold1) > threshold2 replaces a
 * pair of signed comparisons against +/-threshold (dead-zone quantizer).
 * @param separate_dc if set, block[0] is quantized with DC-specific bias
 *                    and shift (different tables above/below qscale 18 to
 *                    avoid overflow), and the scan starts at 1
 * @return index of the last non-zero coefficient, or -1/0 if none
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                /* negate before shifting so the arithmetic shift rounds
                 * toward zero for negative levels too */
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
/**
 * Performs the luma and chroma motion compensation for one partition and one
 * prediction direction, with edge emulation when the (sub-pel expanded) source
 * area reaches outside the reference picture.
 * @param square  nonzero when one qpix_op call covers the whole partition;
 *                otherwise a second call at +delta completes it
 * @param list    prediction list the MV/ref was taken from (0 or 1)
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2); /* quarter-pel phase selects the qpel function */
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
        return;

    /* sub-pel interpolation reads 3 extra pixels around the block */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
/**
 * Standard (non-weighted) motion compensation for one partition: list0
 * prediction is written with the put functions, then an optional list1
 * prediction is averaged in with the avg functions for bi-prediction.
 */
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;
    qpel_mc_func *qpix_op=  qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

    /* x/y_offset are in chroma-sized (8-pel) units; luma uses twice that */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);

        /* second direction averages onto the first */
        qpix_op=  qpix_avg;
        chroma_op= chroma_avg;
    }

    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);
    }
}
1764 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1765 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1766 int x_offset, int y_offset,
1767 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1768 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1769 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1770 int list0, int list1){
1771 MpegEncContext * const s = &h->s;
1773 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1774 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1775 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1776 x_offset += 8*s->mb_x;
1777 y_offset += 8*(s->mb_y >> MB_FIELD);
1779 if(list0 && list1){
1780 /* don't optimize for luma-only case, since B-frames usually
1781 * use implicit weights => chroma too. */
1782 uint8_t *tmp_cb = s->obmc_scratchpad;
1783 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1784 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1785 int refn0 = h->ref_cache[0][ scan8[n] ];
1786 int refn1 = h->ref_cache[1][ scan8[n] ];
1788 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1789 dest_y, dest_cb, dest_cr,
1790 x_offset, y_offset, qpix_put, chroma_put);
1791 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1792 tmp_y, tmp_cb, tmp_cr,
1793 x_offset, y_offset, qpix_put, chroma_put);
1795 if(h->use_weight == 2){
1796 int weight0 = h->implicit_weight[refn0][refn1];
1797 int weight1 = 64 - weight0;
1798 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1799 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1800 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801 }else{
1802 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1803 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1804 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1805 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1806 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1807 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1808 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1810 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1812 }else{
1813 int list = list1 ? 1 : 0;
1814 int refn = h->ref_cache[list][ scan8[n] ];
1815 Picture *ref= &h->ref_list[list][refn];
1816 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1817 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1818 qpix_put, chroma_put);
1820 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1821 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1822 if(h->use_weight_chroma){
1823 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1825 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1826 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1831 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1832 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1833 int x_offset, int y_offset,
1834 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1835 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1836 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1837 int list0, int list1){
1838 if((h->use_weight==2 && list0 && list1
1839 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1840 || h->use_weight==1)
1841 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1842 x_offset, y_offset, qpix_put, chroma_put,
1843 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1844 else
1845 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1846 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1849 static inline void prefetch_motion(H264Context *h, int list){
1850 /* fetch pixels for estimated mv 4 macroblocks ahead
1851 * optimized for 64byte cache lines */
1852 MpegEncContext * const s = &h->s;
1853 const int refn = h->ref_cache[list][scan8[0]];
1854 if(refn >= 0){
1855 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1856 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1857 uint8_t **src= h->ref_list[list][refn].data;
1858 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1859 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1860 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1861 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1865 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1866 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1867 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1868 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1869 MpegEncContext * const s = &h->s;
1870 const int mb_xy= h->mb_xy;
1871 const int mb_type= s->current_picture.mb_type[mb_xy];
1873 assert(IS_INTER(mb_type));
1875 prefetch_motion(h, 0);
1877 if(IS_16X16(mb_type)){
1878 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1879 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1880 &weight_op[0], &weight_avg[0],
1881 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1882 }else if(IS_16X8(mb_type)){
1883 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1884 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1885 &weight_op[1], &weight_avg[1],
1886 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1887 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1888 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1889 &weight_op[1], &weight_avg[1],
1890 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1891 }else if(IS_8X16(mb_type)){
1892 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1893 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1894 &weight_op[2], &weight_avg[2],
1895 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1896 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1897 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1898 &weight_op[2], &weight_avg[2],
1899 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1900 }else{
1901 int i;
1903 assert(IS_8X8(mb_type));
1905 for(i=0; i<4; i++){
1906 const int sub_mb_type= h->sub_mb_type[i];
1907 const int n= 4*i;
1908 int x_offset= (i&1)<<2;
1909 int y_offset= (i&2)<<1;
1911 if(IS_SUB_8X8(sub_mb_type)){
1912 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1913 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1914 &weight_op[3], &weight_avg[3],
1915 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1916 }else if(IS_SUB_8X4(sub_mb_type)){
1917 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1918 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1919 &weight_op[4], &weight_avg[4],
1920 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1921 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1922 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1923 &weight_op[4], &weight_avg[4],
1924 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1925 }else if(IS_SUB_4X8(sub_mb_type)){
1926 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1927 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1928 &weight_op[5], &weight_avg[5],
1929 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1930 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1931 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1932 &weight_op[5], &weight_avg[5],
1933 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1934 }else{
1935 int j;
1936 assert(IS_SUB_4X4(sub_mb_type));
1937 for(j=0; j<4; j++){
1938 int sub_x_offset= x_offset + 2*(j&1);
1939 int sub_y_offset= y_offset + (j&2);
1940 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1941 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1942 &weight_op[6], &weight_avg[6],
1943 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1949 prefetch_motion(h, 1);
1952 static av_cold void decode_init_vlc(void){
1953 static int done = 0;
1955 if (!done) {
1956 int i;
1957 int offset;
1958 done = 1;
1960 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1961 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1962 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1963 &chroma_dc_coeff_token_len [0], 1, 1,
1964 &chroma_dc_coeff_token_bits[0], 1, 1,
1965 INIT_VLC_USE_NEW_STATIC);
1967 offset = 0;
1968 for(i=0; i<4; i++){
1969 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1970 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1971 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1972 &coeff_token_len [i][0], 1, 1,
1973 &coeff_token_bits[i][0], 1, 1,
1974 INIT_VLC_USE_NEW_STATIC);
1975 offset += coeff_token_vlc_tables_size[i];
1978 * This is a one time safety check to make sure that
1979 * the packed static coeff_token_vlc table sizes
1980 * were initialized correctly.
1982 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1984 for(i=0; i<3; i++){
1985 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1986 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1987 init_vlc(&chroma_dc_total_zeros_vlc[i],
1988 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1989 &chroma_dc_total_zeros_len [i][0], 1, 1,
1990 &chroma_dc_total_zeros_bits[i][0], 1, 1,
1991 INIT_VLC_USE_NEW_STATIC);
1993 for(i=0; i<15; i++){
1994 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1995 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1996 init_vlc(&total_zeros_vlc[i],
1997 TOTAL_ZEROS_VLC_BITS, 16,
1998 &total_zeros_len [i][0], 1, 1,
1999 &total_zeros_bits[i][0], 1, 1,
2000 INIT_VLC_USE_NEW_STATIC);
2003 for(i=0; i<6; i++){
2004 run_vlc[i].table = run_vlc_tables[i];
2005 run_vlc[i].table_allocated = run_vlc_tables_size;
2006 init_vlc(&run_vlc[i],
2007 RUN_VLC_BITS, 7,
2008 &run_len [i][0], 1, 1,
2009 &run_bits[i][0], 1, 1,
2010 INIT_VLC_USE_NEW_STATIC);
2012 run7_vlc.table = run7_vlc_table,
2013 run7_vlc.table_allocated = run7_vlc_table_size;
2014 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2015 &run_len [6][0], 1, 1,
2016 &run_bits[6][0], 1, 1,
2017 INIT_VLC_USE_NEW_STATIC);
2021 static void free_tables(H264Context *h){
2022 int i;
2023 H264Context *hx;
2024 av_freep(&h->intra4x4_pred_mode);
2025 av_freep(&h->chroma_pred_mode_table);
2026 av_freep(&h->cbp_table);
2027 av_freep(&h->mvd_table[0]);
2028 av_freep(&h->mvd_table[1]);
2029 av_freep(&h->direct_table);
2030 av_freep(&h->non_zero_count);
2031 av_freep(&h->slice_table_base);
2032 h->slice_table= NULL;
2034 av_freep(&h->mb2b_xy);
2035 av_freep(&h->mb2b8_xy);
2037 for(i = 0; i < h->s.avctx->thread_count; i++) {
2038 hx = h->thread_context[i];
2039 if(!hx) continue;
2040 av_freep(&hx->top_borders[1]);
2041 av_freep(&hx->top_borders[0]);
2042 av_freep(&hx->s.obmc_scratchpad);
2046 static void init_dequant8_coeff_table(H264Context *h){
2047 int i,q,x;
2048 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2049 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2050 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2052 for(i=0; i<2; i++ ){
2053 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2054 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2055 break;
2058 for(q=0; q<52; q++){
2059 int shift = div6[q];
2060 int idx = rem6[q];
2061 for(x=0; x<64; x++)
2062 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2063 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2064 h->pps.scaling_matrix8[i][x]) << shift;
2069 static void init_dequant4_coeff_table(H264Context *h){
2070 int i,j,q,x;
2071 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2072 for(i=0; i<6; i++ ){
2073 h->dequant4_coeff[i] = h->dequant4_buffer[i];
2074 for(j=0; j<i; j++){
2075 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2076 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2077 break;
2080 if(j<i)
2081 continue;
2083 for(q=0; q<52; q++){
2084 int shift = div6[q] + 2;
2085 int idx = rem6[q];
2086 for(x=0; x<16; x++)
2087 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2088 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2089 h->pps.scaling_matrix4[i][x]) << shift;
2094 static void init_dequant_tables(H264Context *h){
2095 int i,x;
2096 init_dequant4_coeff_table(h);
2097 if(h->pps.transform_8x8_mode)
2098 init_dequant8_coeff_table(h);
2099 if(h->sps.transform_bypass){
2100 for(i=0; i<6; i++)
2101 for(x=0; x<16; x++)
2102 h->dequant4_coeff[i][0][x] = 1<<6;
2103 if(h->pps.transform_8x8_mode)
2104 for(i=0; i<2; i++)
2105 for(x=0; x<64; x++)
2106 h->dequant8_coeff[i][0][x] = 1<<6;
2112 * allocates tables.
2113 * needs width/height
2115 static int alloc_tables(H264Context *h){
2116 MpegEncContext * const s = &h->s;
2117 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2118 int x,y;
2120 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2122 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2123 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2124 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2126 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2127 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2128 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2129 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2131 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t));
2132 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2134 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2135 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136 for(y=0; y<s->mb_height; y++){
2137 for(x=0; x<s->mb_width; x++){
2138 const int mb_xy= x + y*s->mb_stride;
2139 const int b_xy = 4*x + 4*y*h->b_stride;
2140 const int b8_xy= 2*x + 2*y*h->b8_stride;
2142 h->mb2b_xy [mb_xy]= b_xy;
2143 h->mb2b8_xy[mb_xy]= b8_xy;
2147 s->obmc_scratchpad = NULL;
2149 if(!h->dequant4_coeff[0])
2150 init_dequant_tables(h);
2152 return 0;
2153 fail:
2154 free_tables(h);
2155 return -1;
2159 * Mimic alloc_tables(), but for every context thread.
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2163 dst->non_zero_count = src->non_zero_count;
2164 dst->slice_table = src->slice_table;
2165 dst->cbp_table = src->cbp_table;
2166 dst->mb2b_xy = src->mb2b_xy;
2167 dst->mb2b8_xy = src->mb2b8_xy;
2168 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2169 dst->mvd_table[0] = src->mvd_table[0];
2170 dst->mvd_table[1] = src->mvd_table[1];
2171 dst->direct_table = src->direct_table;
2173 dst->s.obmc_scratchpad = NULL;
2174 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2178 * Init context
2179 * Allocate buffers which are not shared amongst multiple threads.
2181 static int context_init(H264Context *h){
2182 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2183 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2185 return 0;
2186 fail:
2187 return -1; // free_tables will clean up for us
2190 static av_cold void common_init(H264Context *h){
2191 MpegEncContext * const s = &h->s;
2193 s->width = s->avctx->width;
2194 s->height = s->avctx->height;
2195 s->codec_id= s->avctx->codec->id;
2197 ff_h264_pred_init(&h->hpc, s->codec_id);
2199 h->dequant_coeff_pps= -1;
2200 s->unrestricted_mv=1;
2201 s->decode=1; //FIXME
2203 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2204 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2207 static av_cold int decode_init(AVCodecContext *avctx){
2208 H264Context *h= avctx->priv_data;
2209 MpegEncContext * const s = &h->s;
2211 MPV_decode_defaults(s);
2213 s->avctx = avctx;
2214 common_init(h);
2216 s->out_format = FMT_H264;
2217 s->workaround_bugs= avctx->workaround_bugs;
2219 // set defaults
2220 // s->decode_mb= ff_h263_decode_mb;
2221 s->quarter_sample = 1;
2222 s->low_delay= 1;
2224 if(avctx->codec_id == CODEC_ID_SVQ3)
2225 avctx->pix_fmt= PIX_FMT_YUVJ420P;
2226 else
2227 avctx->pix_fmt= PIX_FMT_YUV420P;
2229 decode_init_vlc();
2231 if(avctx->extradata_size > 0 && avctx->extradata &&
2232 *(char *)avctx->extradata == 1){
2233 h->is_avc = 1;
2234 h->got_avcC = 0;
2235 } else {
2236 h->is_avc = 0;
2239 h->thread_context[0] = h;
2240 h->outputed_poc = INT_MIN;
2241 h->prev_poc_msb= 1<<16;
2242 return 0;
2245 static int frame_start(H264Context *h){
2246 MpegEncContext * const s = &h->s;
2247 int i;
2249 if(MPV_frame_start(s, s->avctx) < 0)
2250 return -1;
2251 ff_er_frame_start(s);
2253 * MPV_frame_start uses pict_type to derive key_frame.
2254 * This is incorrect for H.264; IDR markings must be used.
2255 * Zero here; IDR markings per slice in frame or fields are ORed in later.
2256 * See decode_nal_units().
2258 s->current_picture_ptr->key_frame= 0;
2260 assert(s->linesize && s->uvlinesize);
2262 for(i=0; i<16; i++){
2263 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2266 for(i=0; i<4; i++){
2267 h->block_offset[16+i]=
2268 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269 h->block_offset[24+16+i]=
2270 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2273 /* can't be in alloc_tables because linesize isn't known there.
2274 * FIXME: redo bipred weight to not require extra buffer? */
2275 for(i = 0; i < s->avctx->thread_count; i++)
2276 if(!h->thread_context[i]->s.obmc_scratchpad)
2277 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2279 /* some macroblocks will be accessed before they're available */
2280 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2283 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2285 // We mark the current picture as non-reference after allocating it, so
2286 // that if we break out due to an error it can be released automatically
2287 // in the next MPV_frame_start().
2288 // SVQ3 as well as most other codecs have only last/next/current and thus
2289 // get released even with set reference, besides SVQ3 and others do not
2290 // mark frames as reference later "naturally".
2291 if(s->codec_id != CODEC_ID_SVQ3)
2292 s->current_picture_ptr->reference= 0;
2294 s->current_picture_ptr->field_poc[0]=
2295 s->current_picture_ptr->field_poc[1]= INT_MAX;
2296 assert(s->current_picture_ptr->long_ref==0);
2298 return 0;
2301 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2302 MpegEncContext * const s = &h->s;
2303 int i;
2304 int step = 1;
2305 int offset = 1;
2306 int uvoffset= 1;
2307 int top_idx = 1;
2308 int skiplast= 0;
2310 src_y -= linesize;
2311 src_cb -= uvlinesize;
2312 src_cr -= uvlinesize;
2314 if(!simple && FRAME_MBAFF){
2315 if(s->mb_y&1){
2316 offset = MB_MBAFF ? 1 : 17;
2317 uvoffset= MB_MBAFF ? 1 : 9;
2318 if(!MB_MBAFF){
2319 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
2320 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2321 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2322 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2323 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2326 }else{
2327 if(!MB_MBAFF){
2328 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2329 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2330 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
2331 h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2333 skiplast= 1;
2335 offset =
2336 uvoffset=
2337 top_idx = MB_MBAFF ? 0 : 1;
2339 step= MB_MBAFF ? 2 : 1;
2342 // There are two lines saved, the line above the the top macroblock of a pair,
2343 // and the line above the bottom macroblock
2344 h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2345 for(i=1; i<17 - skiplast; i++){
2346 h->left_border[offset+i*step]= src_y[15+i* linesize];
2349 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2350 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2352 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2353 h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
2354 h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2355 for(i=1; i<9 - skiplast; i++){
2356 h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
2357 h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2359 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2360 *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2364 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2365 MpegEncContext * const s = &h->s;
2366 int temp8, i;
2367 uint64_t temp64;
2368 int deblock_left;
2369 int deblock_top;
2370 int mb_xy;
2371 int step = 1;
2372 int offset = 1;
2373 int uvoffset= 1;
2374 int top_idx = 1;
2376 if(!simple && FRAME_MBAFF){
2377 if(s->mb_y&1){
2378 offset = MB_MBAFF ? 1 : 17;
2379 uvoffset= MB_MBAFF ? 1 : 9;
2380 }else{
2381 offset =
2382 uvoffset=
2383 top_idx = MB_MBAFF ? 0 : 1;
2385 step= MB_MBAFF ? 2 : 1;
2388 if(h->deblocking_filter == 2) {
2389 mb_xy = h->mb_xy;
2390 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2391 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2392 } else {
2393 deblock_left = (s->mb_x > 0);
2394 deblock_top = (s->mb_y > !!MB_FIELD);
2397 src_y -= linesize + 1;
2398 src_cb -= uvlinesize + 1;
2399 src_cr -= uvlinesize + 1;
2401 #define XCHG(a,b,t,xchg)\
2402 t= a;\
2403 if(xchg)\
2404 a= b;\
2405 b= t;
2407 if(deblock_left){
2408 for(i = !deblock_top; i<16; i++){
2409 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
2411 XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
2414 if(deblock_top){
2415 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2416 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2417 if(s->mb_x+1 < s->mb_width){
2418 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2422 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2423 if(deblock_left){
2424 for(i = !deblock_top; i<8; i++){
2425 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
2426 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2428 XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
2429 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2431 if(deblock_top){
2432 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2433 XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2438 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2439 MpegEncContext * const s = &h->s;
2440 const int mb_x= s->mb_x;
2441 const int mb_y= s->mb_y;
2442 const int mb_xy= h->mb_xy;
2443 const int mb_type= s->current_picture.mb_type[mb_xy];
2444 uint8_t *dest_y, *dest_cb, *dest_cr;
2445 int linesize, uvlinesize /*dct_offset*/;
2446 int i;
2447 int *block_offset = &h->block_offset[0];
2448 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2449 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2450 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2452 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2453 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2454 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2456 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2457 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2459 if (!simple && MB_FIELD) {
2460 linesize = h->mb_linesize = s->linesize * 2;
2461 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2462 block_offset = &h->block_offset[24];
2463 if(mb_y&1){ //FIXME move out of this function?
2464 dest_y -= s->linesize*15;
2465 dest_cb-= s->uvlinesize*7;
2466 dest_cr-= s->uvlinesize*7;
2468 if(FRAME_MBAFF) {
2469 int list;
2470 for(list=0; list<h->list_count; list++){
2471 if(!USES_LIST(mb_type, list))
2472 continue;
2473 if(IS_16X16(mb_type)){
2474 int8_t *ref = &h->ref_cache[list][scan8[0]];
2475 fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2476 }else{
2477 for(i=0; i<16; i+=4){
2478 int ref = h->ref_cache[list][scan8[i]];
2479 if(ref >= 0)
2480 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2485 } else {
2486 linesize = h->mb_linesize = s->linesize;
2487 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2488 // dct_offset = s->linesize * 16;
2491 if(transform_bypass){
2492 idct_dc_add =
2493 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2494 }else if(IS_8x8DCT(mb_type)){
2495 idct_dc_add = s->dsp.h264_idct8_dc_add;
2496 idct_add = s->dsp.h264_idct8_add;
2497 }else{
2498 idct_dc_add = s->dsp.h264_idct_dc_add;
2499 idct_add = s->dsp.h264_idct_add;
2502 if (!simple && IS_INTRA_PCM(mb_type)) {
2503 for (i=0; i<16; i++) {
2504 memcpy(dest_y + i* linesize, h->mb + i*8, 16);
2506 for (i=0; i<8; i++) {
2507 memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
2508 memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
2510 } else {
2511 if(IS_INTRA(mb_type)){
2512 if(h->deblocking_filter)
2513 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2515 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2516 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2517 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2520 if(IS_INTRA4x4(mb_type)){
2521 if(simple || !s->encoding){
2522 if(IS_8x8DCT(mb_type)){
2523 for(i=0; i<16; i+=4){
2524 uint8_t * const ptr= dest_y + block_offset[i];
2525 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2526 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2527 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2528 (h->topright_samples_available<<i)&0x4000, linesize);
2529 if(nnz){
2530 if(nnz == 1 && h->mb[i*16])
2531 idct_dc_add(ptr, h->mb + i*16, linesize);
2532 else
2533 idct_add(ptr, h->mb + i*16, linesize);
2536 }else
2537 for(i=0; i<16; i++){
2538 uint8_t * const ptr= dest_y + block_offset[i];
2539 uint8_t *topright;
2540 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2541 int nnz, tr;
2543 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2544 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2545 assert(mb_y || linesize <= block_offset[i]);
2546 if(!topright_avail){
2547 tr= ptr[3 - linesize]*0x01010101;
2548 topright= (uint8_t*) &tr;
2549 }else
2550 topright= ptr + 4 - linesize;
2551 }else
2552 topright= NULL;
2554 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2555 nnz = h->non_zero_count_cache[ scan8[i] ];
2556 if(nnz){
2557 if(is_h264){
2558 if(nnz == 1 && h->mb[i*16])
2559 idct_dc_add(ptr, h->mb + i*16, linesize);
2560 else
2561 idct_add(ptr, h->mb + i*16, linesize);
2562 }else
2563 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2567 }else{
2568 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2569 if(is_h264){
2570 if(!transform_bypass)
2571 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2572 }else
2573 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2575 if(h->deblocking_filter)
2576 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2577 }else if(is_h264){
2578 hl_motion(h, dest_y, dest_cb, dest_cr,
2579 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2580 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2581 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2585 if(!IS_INTRA4x4(mb_type)){
2586 if(is_h264){
2587 if(IS_INTRA16x16(mb_type)){
2588 for(i=0; i<16; i++){
2589 if(h->non_zero_count_cache[ scan8[i] ])
2590 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2591 else if(h->mb[i*16])
2592 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2594 }else{
2595 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2596 for(i=0; i<16; i+=di){
2597 int nnz = h->non_zero_count_cache[ scan8[i] ];
2598 if(nnz){
2599 if(nnz==1 && h->mb[i*16])
2600 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2601 else
2602 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2606 }else{
2607 for(i=0; i<16; i++){
2608 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2609 uint8_t * const ptr= dest_y + block_offset[i];
2610 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2616 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2617 uint8_t *dest[2] = {dest_cb, dest_cr};
2618 if(transform_bypass){
2619 idct_add = idct_dc_add = s->dsp.add_pixels4;
2620 }else{
2621 idct_add = s->dsp.h264_idct_add;
2622 idct_dc_add = s->dsp.h264_idct_dc_add;
2623 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2624 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2626 if(is_h264){
2627 for(i=16; i<16+8; i++){
2628 if(h->non_zero_count_cache[ scan8[i] ])
2629 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2630 else if(h->mb[i*16])
2631 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2633 }else{
2634 for(i=16; i<16+8; i++){
2635 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2636 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2637 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2643 if(h->deblocking_filter) {
2644 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2645 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2646 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2647 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2648 if (!simple && FRAME_MBAFF) {
2649 filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2650 } else {
2651 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2657 * Process a macroblock; this case avoids checks for expensive uncommon cases.
2659 static void hl_decode_mb_simple(H264Context *h){
2660 hl_decode_mb_internal(h, 1);
2664 * Process a macroblock; this handles edge cases, such as interlacing.
2666 static void av_noinline hl_decode_mb_complex(H264Context *h){
2667 hl_decode_mb_internal(h, 0);
2670 static void hl_decode_mb(H264Context *h){
2671 MpegEncContext * const s = &h->s;
2672 const int mb_xy= h->mb_xy;
2673 const int mb_type= s->current_picture.mb_type[mb_xy];
2674 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2675 (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2677 if(ENABLE_H264_ENCODER && !s->decode)
2678 return;
2680 if (is_complex)
2681 hl_decode_mb_complex(h);
2682 else hl_decode_mb_simple(h);
2685 static void pic_as_field(Picture *pic, const int parity){
2686 int i;
2687 for (i = 0; i < 4; ++i) {
2688 if (parity == PICT_BOTTOM_FIELD)
2689 pic->data[i] += pic->linesize[i];
2690 pic->reference = parity;
2691 pic->linesize[i] *= 2;
2693 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2696 static int split_field_copy(Picture *dest, Picture *src,
2697 int parity, int id_add){
2698 int match = !!(src->reference & parity);
2700 if (match) {
2701 *dest = *src;
2702 if(parity != PICT_FRAME){
2703 pic_as_field(dest, parity);
2704 dest->pic_id *= 2;
2705 dest->pic_id += id_add;
2709 return match;
/* Build one default reference list into 'def' from the candidate array 'in'.
 * Entries whose reference marking matches 'sel' are interleaved with entries
 * matching the opposite field parity (sel^3).  'is_long' selects whether
 * pic_id is set to the long-term index or to frame_num.
 * Returns the number of entries written to 'def'. */
2712 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
/* i[0]/i[1]: independent cursors over 'in' for the two parities */
2713 int i[2]={0};
2714 int index=0;
2716 while(i[0]<len || i[1]<len){
/* advance each cursor to the next candidate referenced with its parity */
2717 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2718 i[0]++;
2719 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2720 i[1]++;
2721 if(i[0] < len){
/* pic_id: long-term index for long refs, frame_num for short refs */
2722 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2723 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2725 if(i[1] < len){
2726 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2727 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2731 return index;
/* Append to 'sorted' the pictures from 'src' whose POC lies beyond 'limit'
 * in direction 'dir' (dir=0: ascending POCs above limit; dir!=0: descending
 * POCs below limit), ordered nearest-first.  Implemented as repeated
 * linear selection scans.  Returns the number of pictures appended. */
2734 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2735 int i, best_poc;
2736 int out_i= 0;
2738 for(;;){
/* sentinel: no candidate found yet */
2739 best_poc= dir ? INT_MIN : INT_MAX;
2741 for(i=0; i<len; i++){
2742 const int poc= src[i]->poc;
/* XOR with dir flips both comparisons for the descending direction */
2743 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2744 best_poc= poc;
2745 sorted[out_i]= src[i];
2748 if(best_poc == (dir ? INT_MIN : INT_MAX))
2749 break;
/* tighten the limit past the picture just selected */
2750 limit= sorted[out_i++]->poc - dir;
2752 return out_i;
2756 * fills the default_ref_list.
2758 static int fill_default_ref_list(H264Context *h){
2759 MpegEncContext * const s = &h->s;
2760 int i, len;
2762 if(h->slice_type_nos==FF_B_TYPE){
2763 Picture *sorted[32];
2764 int cur_poc, list;
2765 int lens[2];
2767 if(FIELD_PICTURE)
2768 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2769 else
2770 cur_poc= s->current_picture_ptr->poc;
2772 for(list= 0; list<2; list++){
2773 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2774 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2775 assert(len<=32);
2776 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2777 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2778 assert(len<=32);
2780 if(len < h->ref_count[list])
2781 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2782 lens[list]= len;
2785 if(lens[0] == lens[1] && lens[1] > 1){
2786 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2787 if(i == lens[0])
2788 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2790 }else{
2791 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2792 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2793 assert(len <= 32);
2794 if(len < h->ref_count[0])
2795 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2797 #ifdef TRACE
2798 for (i=0; i<h->ref_count[0]; i++) {
2799 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2801 if(h->slice_type_nos==FF_B_TYPE){
2802 for (i=0; i<h->ref_count[1]; i++) {
2803 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2806 #endif
2807 return 0;
2810 static void print_short_term(H264Context *h);
2811 static void print_long_term(H264Context *h);
2814 * Extract structure information about the picture described by pic_num in
2815 * the current decoding context (frame or field). Note that pic_num is
2816 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2817 * @param pic_num picture number for which to extract structure information
2818 * @param structure one of PICT_XXX describing structure of picture
2819 * with pic_num
2820 * @return frame number (short term) or long term index of picture
2821 * described by pic_num
2823 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2824 MpegEncContext * const s = &h->s;
2826 *structure = s->picture_structure;
2827 if(FIELD_PICTURE){
2828 if (!(pic_num & 1))
2829 /* opposite field */
2830 *structure ^= PICT_FRAME;
2831 pic_num >>= 1;
2834 return pic_num;
2837 static int decode_ref_pic_list_reordering(H264Context *h){
2838 MpegEncContext * const s = &h->s;
2839 int list, index, pic_structure;
2841 print_short_term(h);
2842 print_long_term(h);
2844 for(list=0; list<h->list_count; list++){
2845 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2847 if(get_bits1(&s->gb)){
2848 int pred= h->curr_pic_num;
2850 for(index=0; ; index++){
2851 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2852 unsigned int pic_id;
2853 int i;
2854 Picture *ref = NULL;
2856 if(reordering_of_pic_nums_idc==3)
2857 break;
2859 if(index >= h->ref_count[list]){
2860 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2861 return -1;
2864 if(reordering_of_pic_nums_idc<3){
2865 if(reordering_of_pic_nums_idc<2){
2866 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2867 int frame_num;
2869 if(abs_diff_pic_num > h->max_pic_num){
2870 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2871 return -1;
2874 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2875 else pred+= abs_diff_pic_num;
2876 pred &= h->max_pic_num - 1;
2878 frame_num = pic_num_extract(h, pred, &pic_structure);
2880 for(i= h->short_ref_count-1; i>=0; i--){
2881 ref = h->short_ref[i];
2882 assert(ref->reference);
2883 assert(!ref->long_ref);
2885 ref->frame_num == frame_num &&
2886 (ref->reference & pic_structure)
2888 break;
2890 if(i>=0)
2891 ref->pic_id= pred;
2892 }else{
2893 int long_idx;
2894 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2896 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2898 if(long_idx>31){
2899 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2900 return -1;
2902 ref = h->long_ref[long_idx];
2903 assert(!(ref && !ref->reference));
2904 if(ref && (ref->reference & pic_structure)){
2905 ref->pic_id= pic_id;
2906 assert(ref->long_ref);
2907 i=0;
2908 }else{
2909 i=-1;
2913 if (i < 0) {
2914 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2915 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2916 } else {
2917 for(i=index; i+1<h->ref_count[list]; i++){
2918 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2919 break;
2921 for(; i > index; i--){
2922 h->ref_list[list][i]= h->ref_list[list][i-1];
2924 h->ref_list[list][index]= *ref;
2925 if (FIELD_PICTURE){
2926 pic_as_field(&h->ref_list[list][index], pic_structure);
2929 }else{
2930 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2931 return -1;
2936 for(list=0; list<h->list_count; list++){
2937 for(index= 0; index < h->ref_count[list]; index++){
2938 if(!h->ref_list[list][index].data[0]){
2939 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2940 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2945 return 0;
/* For MBAFF decoding, derive the per-field reference lists: for each frame
 * reference at ref_list[list][i], create top/bottom field variants at
 * indices 16+2*i and 16+2*i+1, and replicate the explicit and implicit
 * weighted-prediction parameters to the same indices. */
2948 static void fill_mbaff_ref_list(H264Context *h){
2949 int list, i, j;
2950 for(list=0; list<2; list++){ //FIXME try list_count
2951 for(i=0; i<h->ref_count[list]; i++){
2952 Picture *frame = &h->ref_list[list][i];
/* field[0]/field[1] = top/bottom field entries derived from 'frame' */
2953 Picture *field = &h->ref_list[list][16+2*i];
2954 field[0] = *frame;
2955 for(j=0; j<3; j++)
2956 field[0].linesize[j] <<= 1;
2957 field[0].reference = PICT_TOP_FIELD;
2958 field[0].poc= field[0].field_poc[0];
2959 field[1] = field[0];
/* bottom field: offset plane pointers by one (frame) line */
2960 for(j=0; j<3; j++)
2961 field[1].data[j] += frame->linesize[j];
2962 field[1].reference = PICT_BOTTOM_FIELD;
2963 field[1].poc= field[1].field_poc[1];
/* both field entries reuse the frame's explicit weights/offsets */
2965 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2966 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2967 for(j=0; j<2; j++){
2968 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2969 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* replicate implicit weights along both table dimensions */
2973 for(j=0; j<h->ref_count[1]; j++){
2974 for(i=0; i<h->ref_count[0]; i++)
2975 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2976 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2977 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
/* Parse pred_weight_table() (explicit weighted prediction) from the slice
 * header: luma/chroma log2 weight denominators, then per-reference
 * weight/offset pairs for each list.  Sets h->use_weight and
 * h->use_weight_chroma when any value differs from the defaults.
 * Returns 0. */
2981 static int pred_weight_table(H264Context *h){
2982 MpegEncContext * const s = &h->s;
2983 int list, i;
2984 int luma_def, chroma_def;
2986 h->use_weight= 0;
2987 h->use_weight_chroma= 0;
2988 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2989 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
/* default weight is 1.0 in the fixed-point scale given by the denominators */
2990 luma_def = 1<<h->luma_log2_weight_denom;
2991 chroma_def = 1<<h->chroma_log2_weight_denom;
2993 for(list=0; list<2; list++){
2994 for(i=0; i<h->ref_count[list]; i++){
2995 int luma_weight_flag, chroma_weight_flag;
2997 luma_weight_flag= get_bits1(&s->gb);
2998 if(luma_weight_flag){
2999 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3000 h->luma_offset[list][i]= get_se_golomb(&s->gb);
/* weighting only takes effect if some value differs from the default */
3001 if( h->luma_weight[list][i] != luma_def
3002 || h->luma_offset[list][i] != 0)
3003 h->use_weight= 1;
3004 }else{
3005 h->luma_weight[list][i]= luma_def;
3006 h->luma_offset[list][i]= 0;
/* chroma weights are only coded for non-monochrome streams */
3009 if(CHROMA){
3010 chroma_weight_flag= get_bits1(&s->gb);
3011 if(chroma_weight_flag){
3012 int j;
3013 for(j=0; j<2; j++){
3014 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3015 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3016 if( h->chroma_weight[list][i][j] != chroma_def
3017 || h->chroma_offset[list][i][j] != 0)
3018 h->use_weight_chroma= 1;
3020 }else{
3021 int j;
3022 for(j=0; j<2; j++){
3023 h->chroma_weight[list][i][j]= chroma_def;
3024 h->chroma_offset[list][i][j]= 0;
/* list 1 is only present for B slices */
3029 if(h->slice_type_nos != FF_B_TYPE) break;
3031 h->use_weight= h->use_weight || h->use_weight_chroma;
3032 return 0;
/* Compute implicit weighted-prediction weights for B slices from the POC
 * distances between the current picture and each (ref0, ref1) pair.
 * Degenerate distances fall back to equal 32/32 weighting; when the current
 * POC is exactly centered between the single L0 and L1 refs, weighting is
 * disabled entirely. */
3035 static void implicit_weight_table(H264Context *h){
3036 MpegEncContext * const s = &h->s;
3037 int ref0, ref1;
3038 int cur_poc = s->current_picture_ptr->poc;
3040 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3041 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3042 h->use_weight= 0;
3043 h->use_weight_chroma= 0;
3044 return;
/* use_weight==2 selects the implicit mode; denominator fixed at 2^5 */
3047 h->use_weight= 2;
3048 h->use_weight_chroma= 2;
3049 h->luma_log2_weight_denom= 5;
3050 h->chroma_log2_weight_denom= 5;
3052 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3053 int poc0 = h->ref_list[0][ref0].poc;
3054 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3055 int poc1 = h->ref_list[1][ref1].poc;
/* td/tb: clipped POC distances; tx: fixed-point reciprocal of td */
3056 int td = av_clip(poc1 - poc0, -128, 127);
3057 if(td){
3058 int tb = av_clip(cur_poc - poc0, -128, 127);
3059 int tx = (16384 + (FFABS(td) >> 1)) / td;
3060 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
/* out-of-range scale factors fall back to equal weighting */
3061 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3062 h->implicit_weight[ref0][ref1] = 32;
3063 else
3064 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3065 }else
3066 h->implicit_weight[ref0][ref1] = 32;
3072 * Mark a picture as no longer needed for reference. The refmask
3073 * argument allows unreferencing of individual fields or the whole frame.
3074 * If the picture becomes entirely unreferenced, but is being held for
3075 * display purposes, it is marked as such.
3076 * @param refmask mask of fields to unreference; the mask is bitwise
3077 * anded with the reference marking of pic
3078 * @return non-zero if pic becomes entirely unreferenced (except possibly
3079 * for display purposes) zero if one of the fields remains in
3080 * reference
3082 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3083 int i;
3084 if (pic->reference &= refmask) {
3085 return 0;
3086 } else {
3087 for(i = 0; h->delayed_pic[i]; i++)
3088 if(pic == h->delayed_pic[i]){
3089 pic->reference=DELAYED_PIC_REF;
3090 break;
3092 return 1;
3097 * instantaneous decoder refresh.
3099 static void idr(H264Context *h){
3100 int i;
3102 for(i=0; i<16; i++){
3103 remove_long(h, i, 0);
3105 assert(h->long_ref_count==0);
3107 for(i=0; i<h->short_ref_count; i++){
3108 unreference_pic(h, h->short_ref[i], 0);
3109 h->short_ref[i]= NULL;
3111 h->short_ref_count=0;
3112 h->prev_frame_num= 0;
3113 h->prev_frame_num_offset= 0;
3114 h->prev_poc_msb=
3115 h->prev_poc_lsb= 0;
3118 /* forget old pics after a seek */
3119 static void flush_dpb(AVCodecContext *avctx){
3120 H264Context *h= avctx->priv_data;
3121 int i;
3122 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3123 if(h->delayed_pic[i])
3124 h->delayed_pic[i]->reference= 0;
3125 h->delayed_pic[i]= NULL;
3127 h->outputed_poc= INT_MIN;
3128 idr(h);
3129 if(h->s.current_picture_ptr)
3130 h->s.current_picture_ptr->reference= 0;
3131 h->s.first_field= 0;
3132 ff_mpeg_flush(avctx);
3136 * Find a Picture in the short term reference list by frame number.
3137 * @param frame_num frame number to search for
3138 * @param idx the index into h->short_ref where returned picture is found
3139 * undefined if no picture found.
3140 * @return pointer to the found picture, or NULL if no pic with the provided
3141 * frame number is found
3143 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3144 MpegEncContext * const s = &h->s;
3145 int i;
3147 for(i=0; i<h->short_ref_count; i++){
3148 Picture *pic= h->short_ref[i];
3149 if(s->avctx->debug&FF_DEBUG_MMCO)
3150 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3151 if(pic->frame_num == frame_num) {
3152 *idx = i;
3153 return pic;
3156 return NULL;
3160 * Remove a picture from the short term reference list by its index in
3161 * that list. This does no checking on the provided index; it is assumed
3162 * to be valid. Other list entries are shifted down.
3163 * @param i index into h->short_ref of picture to remove.
3165 static void remove_short_at_index(H264Context *h, int i){
3166 assert(i >= 0 && i < h->short_ref_count);
3167 h->short_ref[i]= NULL;
3168 if (--h->short_ref_count)
3169 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3174 * @return the removed picture or NULL if an error occurs
3176 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3177 MpegEncContext * const s = &h->s;
3178 Picture *pic;
3179 int i;
3181 if(s->avctx->debug&FF_DEBUG_MMCO)
3182 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3184 pic = find_short(h, frame_num, &i);
3185 if (pic){
3186 if(unreference_pic(h, pic, ref_mask))
3187 remove_short_at_index(h, i);
3190 return pic;
3194 * Remove a picture from the long term reference list by its index in
3195 * that list.
3196 * @return the removed picture or NULL if an error occurs
3198 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3199 Picture *pic;
3201 pic= h->long_ref[i];
3202 if (pic){
3203 if(unreference_pic(h, pic, ref_mask)){
3204 assert(h->long_ref[i]->long_ref == 1);
3205 h->long_ref[i]->long_ref= 0;
3206 h->long_ref[i]= NULL;
3207 h->long_ref_count--;
3211 return pic;
3215 * print short term list
3217 static void print_short_term(H264Context *h) {
3218 uint32_t i;
3219 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3220 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3221 for(i=0; i<h->short_ref_count; i++){
3222 Picture *pic= h->short_ref[i];
3223 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3229 * print long term list
3231 static void print_long_term(H264Context *h) {
3232 uint32_t i;
3233 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3234 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3235 for(i = 0; i < 16; i++){
3236 Picture *pic= h->long_ref[i];
3237 if (pic) {
3238 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
/* Execute the decoded MMCO commands against the DPB, then apply the
 * implicit rules: mark the current picture as a reference (sliding into
 * short_ref unless an MMCO already assigned it) and enforce the
 * sps.ref_frame_count limit.  Returns 0. */
3245 * Executes the reference picture marking (memory management control operations).
3247 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3248 MpegEncContext * const s = &h->s;
3249 int i, j;
/* set when an MMCO already made the current picture a reference */
3250 int current_ref_assigned=0;
3251 Picture *pic;
3253 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3254 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3256 for(i=0; i<mmco_count; i++){
3257 int structure, frame_num;
3258 if(s->avctx->debug&FF_DEBUG_MMCO)
3259 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
/* opcodes addressing a short-term picture: resolve it up front */
3261 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3262 || mmco[i].opcode == MMCO_SHORT2LONG){
3263 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3264 pic = find_short(h, frame_num, &j);
3265 if(!pic){
/* not an error if the picture was already moved to the same long slot */
3266 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3267 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3268 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3269 continue;
3273 switch(mmco[i].opcode){
3274 case MMCO_SHORT2UNUSED:
3275 if(s->avctx->debug&FF_DEBUG_MMCO)
3276 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3277 remove_short(h, frame_num, structure ^ PICT_FRAME);
3278 break;
3279 case MMCO_SHORT2LONG:
/* evict any different picture occupying the target long slot first */
3280 if (h->long_ref[mmco[i].long_arg] != pic)
3281 remove_long(h, mmco[i].long_arg, 0);
3283 remove_short_at_index(h, j);
3284 h->long_ref[ mmco[i].long_arg ]= pic;
3285 if (h->long_ref[ mmco[i].long_arg ]){
3286 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3287 h->long_ref_count++;
3289 break;
3290 case MMCO_LONG2UNUSED:
3291 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3292 pic = h->long_ref[j];
3293 if (pic) {
3294 remove_long(h, j, structure ^ PICT_FRAME);
3295 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3296 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3297 break;
3298 case MMCO_LONG:
3299 // Comment below left from previous code as it is an interresting note.
3300 /* First field in pair is in short term list or
3301 * at a different long term index.
3302 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3303 * Report the problem and keep the pair where it is,
3304 * and mark this field valid.
/* store the current picture at the requested long-term index */
3307 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3308 remove_long(h, mmco[i].long_arg, 0);
3310 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3311 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312 h->long_ref_count++;
3315 s->current_picture_ptr->reference |= s->picture_structure;
3316 current_ref_assigned=1;
3317 break;
3318 case MMCO_SET_MAX_LONG:
3319 assert(mmco[i].long_arg <= 16);
3320 // just remove the long term which index is greater than new max
3321 for(j = mmco[i].long_arg; j<16; j++){
3322 remove_long(h, j, 0);
3324 break;
3325 case MMCO_RESET:
/* full DPB reset: drop every reference and restart frame/POC counting */
3326 while(h->short_ref_count){
3327 remove_short(h, h->short_ref[0]->frame_num, 0);
3329 for(j = 0; j < 16; j++) {
3330 remove_long(h, j, 0);
3332 s->current_picture_ptr->poc=
3333 s->current_picture_ptr->field_poc[0]=
3334 s->current_picture_ptr->field_poc[1]=
3335 h->poc_lsb=
3336 h->poc_msb=
3337 h->frame_num=
3338 s->current_picture_ptr->frame_num= 0;
3339 break;
3340 default: assert(0);
3344 if (!current_ref_assigned) {
3345 /* Second field of complementary field pair; the first field of
3346 * which is already referenced. If short referenced, it
3347 * should be first entry in short_ref. If not, it must exist
3348 * in long_ref; trying to put it on the short list here is an
3349 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3351 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3352 /* Just mark the second field valid */
3353 s->current_picture_ptr->reference = PICT_FRAME;
3354 } else if (s->current_picture_ptr->long_ref) {
3355 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3356 "assignment for second field "
3357 "in complementary field pair "
3358 "(first field is long term)\n");
3359 } else {
/* normal case: push the current picture onto the short-term list */
3360 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3361 if(pic){
3362 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3365 if(h->short_ref_count)
3366 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3368 h->short_ref[0]= s->current_picture_ptr;
3369 h->short_ref_count++;
3370 s->current_picture_ptr->reference |= s->picture_structure;
/* enforce the DPB reference-frame budget from the SPS */
3374 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3376 /* We have too many reference frames, probably due to corrupted
3377 * stream. Need to discard one frame. Prevents overrun of the
3378 * short_ref and long_ref buffers.
3380 av_log(h->s.avctx, AV_LOG_ERROR,
3381 "number of reference frames exceeds max (probably "
3382 "corrupt input), discarding one\n");
3384 if (h->long_ref_count && !h->short_ref_count) {
3385 for (i = 0; i < 16; ++i)
3386 if (h->long_ref[i])
3387 break;
3389 assert(i < 16);
3390 remove_long(h, i, 0);
3391 } else {
/* discard the oldest short-term reference */
3392 pic = h->short_ref[h->short_ref_count - 1];
3393 remove_short(h, pic->frame_num, 0);
3397 print_short_term(h);
3398 print_long_term(h);
3399 return 0;
/* Parse dec_ref_pic_marking() from the slice header into h->mmco[] and
 * h->mmco_index.  For IDR slices only the long_term_reference_flag is
 * possible; otherwise either the explicit adaptive MMCO command list is
 * read, or a sliding-window removal of the oldest short-term reference is
 * synthesized when the DPB is full.  Returns 0, or -1 on bitstream error. */
3402 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3403 MpegEncContext * const s = &h->s;
3404 int i;
3406 h->mmco_index= 0;
3407 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3408 s->broken_link= get_bits1(gb) -1;
/* long_term_reference_flag: keep the IDR picture as a long-term ref */
3409 if(get_bits1(gb)){
3410 h->mmco[0].opcode= MMCO_LONG;
3411 h->mmco[0].long_arg= 0;
3412 h->mmco_index= 1;
3414 }else{
3415 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3416 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3417 MMCOOpcode opcode= get_ue_golomb(gb);
3419 h->mmco[i].opcode= opcode;
3420 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
/* difference_of_pic_nums_minus1 -> absolute pic_num, modulo max_pic_num */
3421 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3422 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3423 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3424 return -1;
3427 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3428 unsigned int long_arg= get_ue_golomb(gb);
/* field pictures may address up to 32 (2*16) long-term entries */
3429 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3430 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3431 return -1;
3433 h->mmco[i].long_arg= long_arg;
3436 if(opcode > (unsigned)MMCO_LONG){
3437 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3438 return -1;
3440 if(opcode == MMCO_END)
3441 break;
3443 h->mmco_index= i;
3444 }else{
/* sliding window: synthesize removal of the oldest short-term ref(s) */
3445 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3447 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3448 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3449 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3450 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3451 h->mmco_index= 1;
/* for field pictures both fields of the frame must be unreferenced */
3452 if (FIELD_PICTURE) {
3453 h->mmco[0].short_pic_num *= 2;
3454 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3455 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3456 h->mmco_index= 2;
3462 return 0;
/* Compute the picture order count (POC) of the current picture for each of
 * the three sps.poc_type modes, storing both field POCs and the picture POC
 * (the smaller of the two fields for frames).  Returns 0. */
3465 static int init_poc(H264Context *h){
3466 MpegEncContext * const s = &h->s;
3467 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3468 int field_poc[2];
3469 Picture *cur = s->current_picture_ptr;
/* frame_num wrapped -> advance the offset by one period */
3471 h->frame_num_offset= h->prev_frame_num_offset;
3472 if(h->frame_num < h->prev_frame_num)
3473 h->frame_num_offset += max_frame_num;
3475 if(h->sps.poc_type==0){
/* type 0: explicit poc_lsb in the bitstream; derive the wrapping MSB */
3476 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3478 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3479 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3480 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3481 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3482 else
3483 h->poc_msb = h->prev_poc_msb;
3484 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3485 field_poc[0] =
3486 field_poc[1] = h->poc_msb + h->poc_lsb;
3487 if(s->picture_structure == PICT_FRAME)
3488 field_poc[1] += h->delta_poc_bottom;
3489 }else if(h->sps.poc_type==1){
/* type 1: POC derived from frame_num and the SPS offset cycle */
3490 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3491 int i;
3493 if(h->sps.poc_cycle_length != 0)
3494 abs_frame_num = h->frame_num_offset + h->frame_num;
3495 else
3496 abs_frame_num = 0;
/* non-reference pictures sit between the cycle positions */
3498 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3499 abs_frame_num--;
3501 expected_delta_per_poc_cycle = 0;
3502 for(i=0; i < h->sps.poc_cycle_length; i++)
3503 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3505 if(abs_frame_num > 0){
3506 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3507 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3509 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3510 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3511 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3512 } else
3513 expectedpoc = 0;
3515 if(h->nal_ref_idc == 0)
3516 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3518 field_poc[0] = expectedpoc + h->delta_poc[0];
3519 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3521 if(s->picture_structure == PICT_FRAME)
3522 field_poc[1] += h->delta_poc[1];
3523 }else{
/* type 2: POC follows decoding order (2*frame_num, -1 for non-refs) */
3524 int poc= 2*(h->frame_num_offset + h->frame_num);
3526 if(!h->nal_ref_idc)
3527 poc--;
3529 field_poc[0]= poc;
3530 field_poc[1]= poc;
/* only store the POC of fields actually present in this picture */
3533 if(s->picture_structure != PICT_BOTTOM_FIELD)
3534 s->current_picture_ptr->field_poc[0]= field_poc[0];
3535 if(s->picture_structure != PICT_TOP_FIELD)
3536 s->current_picture_ptr->field_poc[1]= field_poc[1];
3537 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3539 return 0;
/* Set up the per-context zigzag/field scan tables.  When a SIMD idct with a
 * permuted coefficient layout is in use, the scan tables are permuted to
 * match; the _q0 variants always use the unpermuted tables for lossless
 * (transform-bypass) blocks. */
3544 * initialize scan tables
3546 static void init_scan_tables(H264Context *h){
3547 MpegEncContext * const s = &h->s;
3548 int i;
/* C idct uses the natural coefficient order: copy the tables verbatim */
3549 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3550 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3551 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3552 }else{
3553 for(i=0; i<16; i++){
/* permutation matching the optimized 4x4 idct coefficient layout */
3554 #define T(x) (x>>2) | ((x<<2) & 0xF)
3555 h->zigzag_scan[i] = T(zigzag_scan[i]);
3556 h-> field_scan[i] = T( field_scan[i]);
3557 #undef T
3560 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3561 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3562 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3563 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3564 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3565 }else{
3566 for(i=0; i<64; i++){
/* permutation matching the optimized 8x8 idct coefficient layout */
3567 #define T(x) (x>>3) | ((x&7)<<3)
3568 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3569 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3570 h->field_scan8x8[i] = T(field_scan8x8[i]);
3571 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3572 #undef T
/* bypass blocks skip the idct, so they always use unpermuted scans */
3575 if(h->sps.transform_bypass){ //FIXME same ugly
3576 h->zigzag_scan_q0 = zigzag_scan;
3577 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3578 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3579 h->field_scan_q0 = field_scan;
3580 h->field_scan8x8_q0 = field_scan8x8;
3581 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3582 }else{
3583 h->zigzag_scan_q0 = h->zigzag_scan;
3584 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3585 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3586 h->field_scan_q0 = h->field_scan;
3587 h->field_scan8x8_q0 = h->field_scan8x8;
3588 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3593 * Replicates H264 "master" context to thread contexts.
3595 static void clone_slice(H264Context *dst, H264Context *src)
3597 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3598 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3599 dst->s.current_picture = src->s.current_picture;
3600 dst->s.linesize = src->s.linesize;
3601 dst->s.uvlinesize = src->s.uvlinesize;
3602 dst->s.first_field = src->s.first_field;
3604 dst->prev_poc_msb = src->prev_poc_msb;
3605 dst->prev_poc_lsb = src->prev_poc_lsb;
3606 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3607 dst->prev_frame_num = src->prev_frame_num;
3608 dst->short_ref_count = src->short_ref_count;
3610 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3611 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3612 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3613 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3615 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3616 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3620 * decodes a slice header.
3621 * This will also call MPV_common_init() and frame_start() as needed.
3623 * @param h h264context
3624 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3626 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3628 static int decode_slice_header(H264Context *h, H264Context *h0){
3629 MpegEncContext * const s = &h->s;
3630 MpegEncContext * const s0 = &h0->s;
3631 unsigned int first_mb_in_slice;
3632 unsigned int pps_id;
3633 int num_ref_idx_active_override_flag;
3634 static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3635 unsigned int slice_type, tmp, i, j;
3636 int default_ref_list_done = 0;
3637 int last_pic_structure;
3639 s->dropable= h->nal_ref_idc == 0;
3641 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3642 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3643 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3644 }else{
3645 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3646 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3649 first_mb_in_slice= get_ue_golomb(&s->gb);
3651 if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3652 h0->current_slice = 0;
3653 if (!s0->first_field)
3654 s->current_picture_ptr= NULL;
3657 slice_type= get_ue_golomb(&s->gb);
3658 if(slice_type > 9){
3659 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3660 return -1;
3662 if(slice_type > 4){
3663 slice_type -= 5;
3664 h->slice_type_fixed=1;
3665 }else
3666 h->slice_type_fixed=0;
3668 slice_type= slice_type_map[ slice_type ];
3669 if (slice_type == FF_I_TYPE
3670 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3671 default_ref_list_done = 1;
3673 h->slice_type= slice_type;
3674 h->slice_type_nos= slice_type & 3;
3676 s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3677 if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3678 av_log(h->s.avctx, AV_LOG_ERROR,
3679 "B picture before any references, skipping\n");
3680 return -1;
3683 pps_id= get_ue_golomb(&s->gb);
3684 if(pps_id>=MAX_PPS_COUNT){
3685 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3686 return -1;
3688 if(!h0->pps_buffers[pps_id]) {
3689 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3690 return -1;
3692 h->pps= *h0->pps_buffers[pps_id];
3694 if(!h0->sps_buffers[h->pps.sps_id]) {
3695 av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3696 return -1;
3698 h->sps = *h0->sps_buffers[h->pps.sps_id];
3700 if(h == h0 && h->dequant_coeff_pps != pps_id){
3701 h->dequant_coeff_pps = pps_id;
3702 init_dequant_tables(h);
3705 s->mb_width= h->sps.mb_width;
3706 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3708 h->b_stride= s->mb_width*4;
3709 h->b8_stride= s->mb_width*2;
3711 s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3712 if(h->sps.frame_mbs_only_flag)
3713 s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3714 else
3715 s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3717 if (s->context_initialized
3718 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3719 if(h != h0)
3720 return -1; // width / height changed during parallelized decoding
3721 free_tables(h);
3722 MPV_common_end(s);
3724 if (!s->context_initialized) {
3725 if(h != h0)
3726 return -1; // we cant (re-)initialize context during parallel decoding
3727 if (MPV_common_init(s) < 0)
3728 return -1;
3729 s->first_field = 0;
3731 init_scan_tables(h);
3732 alloc_tables(h);
3734 for(i = 1; i < s->avctx->thread_count; i++) {
3735 H264Context *c;
3736 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3737 memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3738 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3739 c->sps = h->sps;
3740 c->pps = h->pps;
3741 init_scan_tables(c);
3742 clone_tables(c, h);
3745 for(i = 0; i < s->avctx->thread_count; i++)
3746 if(context_init(h->thread_context[i]) < 0)
3747 return -1;
3749 s->avctx->width = s->width;
3750 s->avctx->height = s->height;
3751 s->avctx->sample_aspect_ratio= h->sps.sar;
3752 if(!s->avctx->sample_aspect_ratio.den)
3753 s->avctx->sample_aspect_ratio.den = 1;
3755 if(h->sps.timing_info_present_flag){
3756 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3757 if(h->x264_build > 0 && h->x264_build < 44)
3758 s->avctx->time_base.den *= 2;
3759 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3760 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3764 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3766 h->mb_mbaff = 0;
3767 h->mb_aff_frame = 0;
3768 last_pic_structure = s0->picture_structure;
3769 if(h->sps.frame_mbs_only_flag){
3770 s->picture_structure= PICT_FRAME;
3771 }else{
3772 if(get_bits1(&s->gb)) { //field_pic_flag
3773 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3774 } else {
3775 s->picture_structure= PICT_FRAME;
3776 h->mb_aff_frame = h->sps.mb_aff;
3779 h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3781 if(h0->current_slice == 0){
3782 while(h->frame_num != h->prev_frame_num &&
3783 h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3784 av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3785 frame_start(h);
3786 h->prev_frame_num++;
3787 h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3788 s->current_picture_ptr->frame_num= h->prev_frame_num;
3789 execute_ref_pic_marking(h, NULL, 0);
3792 /* See if we have a decoded first field looking for a pair... */
3793 if (s0->first_field) {
3794 assert(s0->current_picture_ptr);
3795 assert(s0->current_picture_ptr->data[0]);
3796 assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3798 /* figure out if we have a complementary field pair */
3799 if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3801 * Previous field is unmatched. Don't display it, but let it
3802 * remain for reference if marked as such.
3804 s0->current_picture_ptr = NULL;
3805 s0->first_field = FIELD_PICTURE;
3807 } else {
3808 if (h->nal_ref_idc &&
3809 s0->current_picture_ptr->reference &&
3810 s0->current_picture_ptr->frame_num != h->frame_num) {
3812 * This and previous field were reference, but had
3813 * different frame_nums. Consider this field first in
3814 * pair. Throw away previous field except for reference
3815 * purposes.
3817 s0->first_field = 1;
3818 s0->current_picture_ptr = NULL;
3820 } else {
3821 /* Second field in complementary pair */
3822 s0->first_field = 0;
3826 } else {
3827 /* Frame or first field in a potentially complementary pair */
3828 assert(!s0->current_picture_ptr);
3829 s0->first_field = FIELD_PICTURE;
3832 if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3833 s0->first_field = 0;
3834 return -1;
3837 if(h != h0)
3838 clone_slice(h, h0);
3840 s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3842 assert(s->mb_num == s->mb_width * s->mb_height);
3843 if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3844 first_mb_in_slice >= s->mb_num){
3845 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3846 return -1;
3848 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3849 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3850 if (s->picture_structure == PICT_BOTTOM_FIELD)
3851 s->resync_mb_y = s->mb_y = s->mb_y + 1;
3852 assert(s->mb_y < s->mb_height);
3854 if(s->picture_structure==PICT_FRAME){
3855 h->curr_pic_num= h->frame_num;
3856 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3857 }else{
3858 h->curr_pic_num= 2*h->frame_num + 1;
3859 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3862 if(h->nal_unit_type == NAL_IDR_SLICE){
3863 get_ue_golomb(&s->gb); /* idr_pic_id */
3866 if(h->sps.poc_type==0){
3867 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3869 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3870 h->delta_poc_bottom= get_se_golomb(&s->gb);
3874 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3875 h->delta_poc[0]= get_se_golomb(&s->gb);
3877 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3878 h->delta_poc[1]= get_se_golomb(&s->gb);
3881 init_poc(h);
3883 if(h->pps.redundant_pic_cnt_present){
3884 h->redundant_pic_count= get_ue_golomb(&s->gb);
3887 //set defaults, might be overridden a few lines later
3888 h->ref_count[0]= h->pps.ref_count[0];
3889 h->ref_count[1]= h->pps.ref_count[1];
3891 if(h->slice_type_nos != FF_I_TYPE){
3892 if(h->slice_type_nos == FF_B_TYPE){
3893 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3895 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3897 if(num_ref_idx_active_override_flag){
3898 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3899 if(h->slice_type_nos==FF_B_TYPE)
3900 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3902 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3903 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3904 h->ref_count[0]= h->ref_count[1]= 1;
3905 return -1;
3908 if(h->slice_type_nos == FF_B_TYPE)
3909 h->list_count= 2;
3910 else
3911 h->list_count= 1;
3912 }else
3913 h->list_count= 0;
3915 if(!default_ref_list_done){
3916 fill_default_ref_list(h);
3919 if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3920 return -1;
3922 if(h->slice_type_nos!=FF_I_TYPE){
3923 s->last_picture_ptr= &h->ref_list[0][0];
3924 ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3926 if(h->slice_type_nos==FF_B_TYPE){
3927 s->next_picture_ptr= &h->ref_list[1][0];
3928 ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3931 if( (h->pps.weighted_pred && h->slice_type_nos == FF_P_TYPE )
3932 || (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3933 pred_weight_table(h);
3934 else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3935 implicit_weight_table(h);
3936 else
3937 h->use_weight = 0;
3939 if(h->nal_ref_idc)
3940 decode_ref_pic_marking(h0, &s->gb);
3942 if(FRAME_MBAFF)
3943 fill_mbaff_ref_list(h);
3945 if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3946 direct_dist_scale_factor(h);
3947 direct_ref_list_init(h);
3949 if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3950 tmp = get_ue_golomb(&s->gb);
3951 if(tmp > 2){
3952 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3953 return -1;
3955 h->cabac_init_idc= tmp;
3958 h->last_qscale_diff = 0;
3959 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3960 if(tmp>51){
3961 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3962 return -1;
3964 s->qscale= tmp;
3965 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3966 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3967 //FIXME qscale / qp ... stuff
3968 if(h->slice_type == FF_SP_TYPE){
3969 get_bits1(&s->gb); /* sp_for_switch_flag */
3971 if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3972 get_se_golomb(&s->gb); /* slice_qs_delta */
3975 h->deblocking_filter = 1;
3976 h->slice_alpha_c0_offset = 0;
3977 h->slice_beta_offset = 0;
3978 if( h->pps.deblocking_filter_parameters_present ) {
3979 tmp= get_ue_golomb(&s->gb);
3980 if(tmp > 2){
3981 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3982 return -1;
3984 h->deblocking_filter= tmp;
3985 if(h->deblocking_filter < 2)
3986 h->deblocking_filter^= 1; // 1<->0
3988 if( h->deblocking_filter ) {
3989 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3990 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3994 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
3995 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3996 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type_nos == FF_B_TYPE)
3997 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3998 h->deblocking_filter= 0;
4000 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4001 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4002 /* Cheat slightly for speed:
4003 Do not bother to deblock across slices. */
4004 h->deblocking_filter = 2;
4005 } else {
4006 h0->max_contexts = 1;
4007 if(!h0->single_decode_warning) {
4008 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4009 h0->single_decode_warning = 1;
4011 if(h != h0)
4012 return 1; // deblocking switched inside frame
4016 #if 0 //FMO
4017 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4018 slice_group_change_cycle= get_bits(&s->gb, ?);
4019 #endif
4021 h0->last_slice_type = slice_type;
4022 h->slice_num = ++h0->current_slice;
4024 for(j=0; j<2; j++){
4025 int *ref2frm= h->ref2frm[h->slice_num&15][j];
4026 ref2frm[0]=
4027 ref2frm[1]= -1;
4028 for(i=0; i<16; i++)
4029 ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4030 +(h->ref_list[j][i].reference&3);
4031 ref2frm[18+0]=
4032 ref2frm[18+1]= -1;
4033 for(i=16; i<48; i++)
4034 ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4035 +(h->ref_list[j][i].reference&3);
4038 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4039 h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4041 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4042 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4043 h->slice_num,
4044 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4045 first_mb_in_slice,
4046 av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4047 pps_id, h->frame_num,
4048 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4049 h->ref_count[0], h->ref_count[1],
4050 s->qscale,
4051 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4052 h->use_weight,
4053 h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4054 h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4058 return 0;
/**
 * Reads the CAVLC level_prefix: a unary code, i.e. the number of leading
 * zero bits before the next 1 bit in the bitstream.
 * The whole unary code (zeros plus the terminating 1) is consumed.
 * @return the number of leading zeros (may be 0)
 */
static inline int get_level_prefix(GetBitContext *gb){
    unsigned int buf;
    int log;

    OPEN_READER(re, gb);
    UPDATE_CACHE(re, gb);
    buf=GET_CACHE(re, gb);

    // position of the first set bit from the MSB side; the unary code
    // including its terminating 1 is (log) bits long
    log= 32 - av_log2(buf);
#ifdef TRACE
    print_bin(buf>>(32-log), log);
    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
#endif

    LAST_SKIP_BITS(re, gb, log);
    CLOSE_READER(re, gb);

    return log-1;
}
4084 static inline int get_dct8x8_allowed(H264Context *h){
4085 int i;
4086 for(i=0; i<4; i++){
4087 if(!IS_SUB_8X8(h->sub_mb_type[i])
4088 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4089 return 0;
4091 return 1;
/**
 * Decodes one CAVLC residual block (ITU-T H.264, subclause 9.2).
 * @param gb bit reader to parse from (may differ from s->gb when
 *           slice data partitioning is in use)
 * @param block output coefficient array, filled in scan order via scantable
 * @param n block index (special values: LUMA_DC_BLOCK_INDEX,
 *          CHROMA_DC_BLOCK_INDEX select the DC-only code paths)
 * @param scantable zigzag/field scan used to place coefficients
 * @param qmul dequantization table, or NULL for the chroma DC block
 *             (n > 24), which is left unquantized here
 * @param max_coeff number of coefficients in the block (4, 15 or 16)
 * @return 0 on success, <0 if an error occurred
 */
static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
    MpegEncContext * const s = &h->s;
    // maps the predicted non-zero count to one of the four coeff_token VLC tables
    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    int level[16];
    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;

    //FIXME put trailing_onex into the context

    // --- coeff_token: packs total_coeff (>>2) and trailing_ones (&3) ---
    if(n == CHROMA_DC_BLOCK_INDEX){
        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
        total_coeff= coeff_token>>2;
    }else{
        if(n == LUMA_DC_BLOCK_INDEX){
            total_coeff= pred_non_zero_count(h, 0);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
        }else{
            total_coeff= pred_non_zero_count(h, n);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
        }
    }

    //FIXME set last_non_zero?
    if(total_coeff==0)
        return 0;
    if(total_coeff > (unsigned)max_coeff) {
        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
        return -1;
    }

    trailing_ones= coeff_token&3;
    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
    assert(total_coeff<=16);

    // --- levels: trailing +/-1s are coded as single sign bits ---
    for(i=0; i<trailing_ones; i++){
        level[i]= 1 - 2*get_bits1(gb);
    }

    if(i<total_coeff) {
        int level_code, mask;
        int suffix_length = total_coeff > 10 && trailing_ones < 3;
        int prefix= get_level_prefix(gb);

        //first coefficient has suffix_length equal to 0 or 1
        if(prefix<14){ //FIXME try to build a large unified VLC table for all this
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= (prefix<<suffix_length); //part
        }else if(prefix==14){
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= prefix + get_bits(gb, 4); //part
        }else{
            level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
            if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
            if(prefix>=16)
                level_code += (1<<(prefix-3))-4096;
        }

        if(trailing_ones < 3) level_code += 2;

        suffix_length = 1;
        if(level_code > 5)
            suffix_length++;
        // level_code even -> positive level, odd -> negative
        mask= -(level_code&1);
        level[i]= (((2+level_code)>>1) ^ mask) - mask;
        i++;

        //remaining coefficients have suffix_length > 0
        for(;i<total_coeff;i++) {
            static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
            prefix = get_level_prefix(gb);
            if(prefix<15){
                level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
            }else{
                level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
                if(prefix>=16)
                    level_code += (1<<(prefix-3))-4096;
            }
            mask= -(level_code&1);
            level[i]= (((2+level_code)>>1) ^ mask) - mask;
            // adaptively grow the suffix length as levels get larger
            if(level_code > suffix_limit[suffix_length])
                suffix_length++;
        }
    }

    // --- total_zeros: number of zeros between the last coeff and block start ---
    if(total_coeff == max_coeff)
        zeros_left=0;
    else{
        if(n == CHROMA_DC_BLOCK_INDEX)
            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
        else
            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

    // --- place coefficients from last to first, reading run_before between them ---
    coeff_num = zeros_left + total_coeff - 1;
    j = scantable[coeff_num];
    if(n > 24){
        // chroma DC: stored unquantized (qmul is NULL for this path)
        block[j] = level[0];
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= level[i];
        }
    }else{
        // regular blocks: dequantize while storing
        block[j] = (level[0] * qmul[j] + 32)>>6;
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= (level[i] * qmul[j] + 32)>>6;
        }
    }

    if(zeros_left<0){
        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    return 0;
}
4245 static void predict_field_decoding_flag(H264Context *h){
4246 MpegEncContext * const s = &h->s;
4247 const int mb_xy= h->mb_xy;
4248 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4249 ? s->current_picture.mb_type[mb_xy-1]
4250 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4251 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4252 : 0;
4253 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
/**
 * Decodes a P_SKIP or B_SKIP macroblock: no residual, motion is fully
 * predicted from neighbours (P skip) or derived by direct prediction (B skip).
 */
static void decode_mb_skip(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    int mb_type=0;

    // skipped MBs carry no coefficients
    memset(h->non_zero_count[mb_xy], 0, 16);
    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui

    if(MB_FIELD)
        mb_type|= MB_TYPE_INTERLACED;

    if( h->slice_type_nos == FF_B_TYPE )
    {
        // just for fill_caches. pred_direct_motion will set the real mb_type
        mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;

        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
        pred_direct_motion(h, &mb_type);
        mb_type|= MB_TYPE_SKIP;
    }
    else
    {
        int mx, my;
        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;

        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
        // P skip: ref index 0 with the predicted motion vector
        pred_pskip_motion(h, &mx, &my);
        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
        fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
    }

    write_back_motion(h, mb_type);
    s->current_picture.mb_type[mb_xy]= mb_type;
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    h->slice_table[ mb_xy ]= h->slice_num;
    h->prev_mb_skipped= 1;
}
/**
 * Decodes one macroblock from a CAVLC-coded slice: skip handling, mb_type,
 * intra prediction modes or motion data, CBP and the residual blocks.
 * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
 */
static int decode_mb_cavlc(H264Context *h){
    MpegEncContext * const s = &h->s;
    int mb_xy;
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

    s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?

    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                down the code */
    // --- P/B skip run handling ---
    if(h->slice_type_nos != FF_I_TYPE){
        if(s->mb_skip_run==-1)
            s->mb_skip_run= get_ue_golomb(&s->gb);

        if (s->mb_skip_run--) {
            if(FRAME_MBAFF && (s->mb_y&1) == 0){
                // field flag is only coded for the last skipped MB of a pair;
                // earlier ones must predict it from neighbours
                if(s->mb_skip_run==0)
                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
                else
                    predict_field_decoding_flag(h);
            }
            decode_mb_skip(h);
            return 0;
        }
    }
    if(FRAME_MBAFF){
        if( (s->mb_y&1) == 0 )
            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
    }

    h->prev_mb_skipped= 0;

    // --- mb_type: mapped through per-slice-type tables; B and P types that
    // exceed their table fall through to the shared intra range ---
    mb_type= get_ue_golomb(&s->gb);
    if(h->slice_type_nos == FF_B_TYPE){
        if(mb_type < 23){
            partition_count= b_mb_type_info[mb_type].partition_count;
            mb_type= b_mb_type_info[mb_type].type;
        }else{
            mb_type -= 23;
            goto decode_intra_mb;
        }
    }else if(h->slice_type_nos == FF_P_TYPE){
        if(mb_type < 5){
            partition_count= p_mb_type_info[mb_type].partition_count;
            mb_type= p_mb_type_info[mb_type].type;
        }else{
            mb_type -= 5;
            goto decode_intra_mb;
        }
    }else{
       assert(h->slice_type_nos == FF_I_TYPE);
        if(h->slice_type == FF_SI_TYPE && mb_type)
            mb_type--;
decode_intra_mb:
        if(mb_type > 25){
            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
            return -1;
        }
        partition_count=0;
        cbp= i_mb_type_info[mb_type].cbp;
        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
        mb_type= i_mb_type_info[mb_type].type;
    }

    if(MB_FIELD)
        mb_type |= MB_TYPE_INTERLACED;

    h->slice_table[ mb_xy ]= h->slice_num;

    // --- I_PCM: raw samples, no prediction or residual ---
    if(IS_INTRA_PCM(mb_type)){
        unsigned int x;

        // We assume these blocks are very rare so we do not optimize it.
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
        for(x=0; x < (CHROMA ? 384 : 256); x++){
            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
        }

        // In deblocking, the quantizer is 0
        s->current_picture.qscale_table[mb_xy]= 0;
        // All coeffs are present
        memset(h->non_zero_count[mb_xy], 16, 16);

        s->current_picture.mb_type[mb_xy]= mb_type;
        return 0;
    }

    // in MBAFF frames the ref counts are doubled while decoding a pair
    // (restored at the end of this function)
    if(MB_MBAFF){
        h->ref_count[0] <<= 1;
        h->ref_count[1] <<= 1;
    }

    fill_caches(h, mb_type, 0);

    //mb_pred
    if(IS_INTRA(mb_type)){
        int pred_mode;
//            init_top_left_availability(h);
        if(IS_INTRA4x4(mb_type)){
            int i;
            int di = 1;
            if(dct8x8_allowed && get_bits1(&s->gb)){
                mb_type |= MB_TYPE_8x8DCT;
                di = 4; // 8x8 transform: one pred mode covers four 4x4 blocks
            }

//                fill_intra4x4_pred_table(h);
            for(i=0; i<16; i+=di){
                int mode= pred_intra_mode(h, i);

                if(!get_bits1(&s->gb)){
                    const int rem_mode= get_bits(&s->gb, 3);
                    // rem_mode skips over the predicted mode
                    mode = rem_mode + (rem_mode >= mode);
                }

                if(di==4)
                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
                else
                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
            }
            write_back_intra_pred_mode(h);
            if( check_intra4x4_pred_mode(h) < 0)
                return -1;
        }else{
            h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
            if(h->intra16x16_pred_mode < 0)
                return -1;
        }
        if(CHROMA){
            pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
            if(pred_mode < 0)
                return -1;
            h->chroma_pred_mode= pred_mode;
        }
    }else if(partition_count==4){
        // --- 8x8 sub-macroblock partitions ---
        int i, j, sub_partition_count[4], list, ref[2][4];

        if(h->slice_type_nos == FF_B_TYPE){
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=13){
                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
            if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
                pred_direct_motion(h, &mb_type);
                h->ref_cache[0][scan8[4]] =
                h->ref_cache[1][scan8[4]] =
                h->ref_cache[0][scan8[12]] =
                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
            }
        }else{
            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb(&s->gb);
                if(h->sub_mb_type[i] >=4){
                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
        }

        // reference indices for all non-direct sub-blocks
        for(list=0; list<h->list_count; list++){
            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
                    if(tmp>=ref_count){
                        av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
                        return -1;
                    }
                    ref[list][i]= tmp;
                }else{
                 //FIXME
                    ref[list][i] = -1;
                }
            }
        }

        if(dct8x8_allowed)
            dct8x8_allowed = get_dct8x8_allowed(h);

        // motion vector differences for all non-direct sub-partitions
        for(list=0; list<h->list_count; list++){
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) {
                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
                    continue;
                }
                h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];

                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    const int sub_mb_type= h->sub_mb_type[i];
                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                    for(j=0; j<sub_partition_count[i]; j++){
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        // replicate the MV over the cells the partition covers
                        if(IS_SUB_8X8(sub_mb_type)){
                            mv_cache[ 1 ][0]=
                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
                            mv_cache[ 1 ][1]=
                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
                        }else if(IS_SUB_8X4(sub_mb_type)){
                            mv_cache[ 1 ][0]= mx;
                            mv_cache[ 1 ][1]= my;
                        }else if(IS_SUB_4X8(sub_mb_type)){
                            mv_cache[ 8 ][0]= mx;
                            mv_cache[ 8 ][1]= my;
                        }
                        mv_cache[ 0 ][0]= mx;
                        mv_cache[ 0 ][1]= my;
                    }
                }else{
                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
                    p[0] = p[1]=
                    p[8] = p[9]= 0;
                }
            }
        }
    }else if(IS_DIRECT(mb_type)){
        pred_direct_motion(h, &mb_type);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    }else{
        // --- 16x16, 16x8 or 8x16 inter partitions ---
        int list, mx, my, i;
         //FIXME we should set ref_idx_l? to 0 if we use that later ...
        if(IS_16X16(mb_type)){
            for(list=0; list<h->list_count; list++){
                    unsigned int val;
                    if(IS_DIR(mb_type, 0, list)){
                        val= get_te0_golomb(&s->gb, h->ref_count[list]);
                        if(val >= h->ref_count[list]){
                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                            return -1;
                        }
                    }else
                        val= LIST_NOT_USED&0xFF;
                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
            }
            for(list=0; list<h->list_count; list++){
                unsigned int val;
                if(IS_DIR(mb_type, 0, list)){
                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
                    mx += get_se_golomb(&s->gb);
                    my += get_se_golomb(&s->gb);
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    val= pack16to32(mx,my);
                }else
                    val=0;
                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
            }
        }
        else if(IS_16X8(mb_type)){
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
                }
            }
        }else{
            assert(IS_8X16(mb_type));
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                            val= get_te0_golomb(&s->gb, h->ref_count[list]);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
                }
            }
        }
    }

    if(IS_INTER(mb_type))
        write_back_motion(h, mb_type);

    // --- coded block pattern (implicit for Intra_16x16 types) ---
    if(!IS_INTRA16x16(mb_type)){
        cbp= get_ue_golomb(&s->gb);
        if(cbp > 47){
            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
            return -1;
        }

        if(CHROMA){
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
            else                     cbp= golomb_to_inter_cbp   [cbp];
        }else{
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
            else                     cbp= golomb_to_inter_cbp_gray[cbp];
        }
    }
    h->cbp = cbp;

    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
        if(get_bits1(&s->gb)){
            mb_type |= MB_TYPE_8x8DCT;
            h->cbp_table[mb_xy]= cbp;
        }
    }
    s->current_picture.mb_type[mb_xy]= mb_type;

    // --- residual decoding ---
    if(cbp || IS_INTRA16x16(mb_type)){
        int i8x8, i4x4, chroma_idx;
        int dquant;
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;

//        fill_non_zero_count_cache(h);

        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);

        if( dquant > 25 || dquant < -26 ){
            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
            return -1;
        }

        // QP wraps around modulo 52
        s->qscale += dquant;
        if(((unsigned)s->qscale) > 51){
            if(s->qscale<0) s->qscale+= 52;
            else            s->qscale-= 52;
        }

        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            // separate luma DC block, then 15-coeff AC blocks
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

            assert((cbp&15) == 0 || (cbp&15) == 15);

            if(cbp&15){
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
                }
            }else{
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
            }
        }else{
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
                        DCTELEM *buf = &h->mb[64*i8x8];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                return -1;
                        }
                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                    }else{
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
                    }
                }else{
                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
                }
            }
        }

        // chroma DC blocks (cbp bits 4-5)
        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }

        // chroma AC blocks (cbp bit 5)
        if(cbp&0x20){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
            }
        }else{
            uint8_t * const nnz= &h->non_zero_count_cache[0];
            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
        }
    }else{
        // no residual at all: clear the whole non-zero-count cache
        uint8_t * const nnz= &h->non_zero_count_cache[0];
        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
    }
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    write_back_non_zero_count(h);

    if(MB_MBAFF){
        h->ref_count[0] >>= 1;
        h->ref_count[1] >>= 1;
    }

    return 0;
}
4784 static int decode_cabac_field_decoding_flag(H264Context *h) {
4785 MpegEncContext * const s = &h->s;
4786 const int mb_x = s->mb_x;
4787 const int mb_y = s->mb_y & ~1;
4788 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4789 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4791 unsigned int ctx = 0;
4793 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4794 ctx += 1;
4796 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4797 ctx += 1;
4800 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4803 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4804 uint8_t *state= &h->cabac_state[ctx_base];
4805 int mb_type;
4807 if(intra_slice){
4808 MpegEncContext * const s = &h->s;
4809 const int mba_xy = h->left_mb_xy[0];
4810 const int mbb_xy = h->top_mb_xy;
4811 int ctx=0;
4812 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4813 ctx++;
4814 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4815 ctx++;
4816 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4817 return 0; /* I4x4 */
4818 state += 2;
4819 }else{
4820 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4821 return 0; /* I4x4 */
4824 if( get_cabac_terminate( &h->cabac ) )
4825 return 25; /* PCM */
4827 mb_type = 1; /* I16x16 */
4828 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4829 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4830 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4831 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4832 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4833 return mb_type;
4836 static int decode_cabac_mb_type( H264Context *h ) {
4837 MpegEncContext * const s = &h->s;
4839 if( h->slice_type_nos == FF_I_TYPE ) {
4840 return decode_cabac_intra_mb_type(h, 3, 1);
4841 } else if( h->slice_type_nos == FF_P_TYPE ) {
4842 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4843 /* P-type */
4844 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4845 /* P_L0_D16x16, P_8x8 */
4846 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4847 } else {
4848 /* P_L0_D8x16, P_L0_D16x8 */
4849 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4851 } else {
4852 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4854 } else if( h->slice_type_nos == FF_B_TYPE ) {
4855 const int mba_xy = h->left_mb_xy[0];
4856 const int mbb_xy = h->top_mb_xy;
4857 int ctx = 0;
4858 int bits;
4860 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4861 ctx++;
4862 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4863 ctx++;
4865 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4866 return 0; /* B_Direct_16x16 */
4868 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4869 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4872 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4873 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4874 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4875 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4876 if( bits < 8 )
4877 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4878 else if( bits == 13 ) {
4879 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4880 } else if( bits == 14 )
4881 return 11; /* B_L1_L0_8x16 */
4882 else if( bits == 15 )
4883 return 22; /* B_8x8 */
4885 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4886 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4887 } else {
4888 /* TODO SI/SP frames? */
4889 return -1;
4893 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4894 MpegEncContext * const s = &h->s;
4895 int mba_xy, mbb_xy;
4896 int ctx = 0;
4898 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4899 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4900 mba_xy = mb_xy - 1;
4901 if( (mb_y&1)
4902 && h->slice_table[mba_xy] == h->slice_num
4903 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4904 mba_xy += s->mb_stride;
4905 if( MB_FIELD ){
4906 mbb_xy = mb_xy - s->mb_stride;
4907 if( !(mb_y&1)
4908 && h->slice_table[mbb_xy] == h->slice_num
4909 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4910 mbb_xy -= s->mb_stride;
4911 }else
4912 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4913 }else{
4914 int mb_xy = h->mb_xy;
4915 mba_xy = mb_xy - 1;
4916 mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4919 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4920 ctx++;
4921 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4922 ctx++;
4924 if( h->slice_type_nos == FF_B_TYPE )
4925 ctx += 13;
4926 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4929 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4930 int mode = 0;
4932 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4933 return pred_mode;
4935 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4936 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4937 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4939 if( mode >= pred_mode )
4940 return mode + 1;
4941 else
4942 return mode;
4945 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4946 const int mba_xy = h->left_mb_xy[0];
4947 const int mbb_xy = h->top_mb_xy;
4949 int ctx = 0;
4951 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4952 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4953 ctx++;
4955 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4956 ctx++;
4958 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4959 return 0;
4961 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4962 return 1;
4963 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4964 return 2;
4965 else
4966 return 3;
4969 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4970 int cbp_b, cbp_a, ctx, cbp = 0;
4972 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4973 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
4975 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4976 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4977 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
4978 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4979 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
4980 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4981 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
4982 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4983 return cbp;
4985 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4986 int ctx;
4987 int cbp_a, cbp_b;
4989 cbp_a = (h->left_cbp>>4)&0x03;
4990 cbp_b = (h-> top_cbp>>4)&0x03;
4992 ctx = 0;
4993 if( cbp_a > 0 ) ctx++;
4994 if( cbp_b > 0 ) ctx += 2;
4995 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4996 return 0;
4998 ctx = 4;
4999 if( cbp_a == 2 ) ctx++;
5000 if( cbp_b == 2 ) ctx += 2;
5001 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5003 static int decode_cabac_mb_dqp( H264Context *h) {
5004 int ctx = 0;
5005 int val = 0;
5007 if( h->last_qscale_diff != 0 )
5008 ctx++;
5010 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5011 if( ctx < 2 )
5012 ctx = 2;
5013 else
5014 ctx = 3;
5015 val++;
5016 if(val > 102) //prevent infinite loop
5017 return INT_MIN;
5020 if( val&0x01 )
5021 return (val + 1)/2;
5022 else
5023 return -(val + 1)/2;
5025 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5026 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5027 return 0; /* 8x8 */
5028 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5029 return 1; /* 8x4 */
5030 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5031 return 2; /* 4x8 */
5032 return 3; /* 4x4 */
5034 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5035 int type;
5036 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5037 return 0; /* B_Direct_8x8 */
5038 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5039 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5040 type = 3;
5041 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5042 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5043 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5044 type += 4;
5046 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5047 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5048 return type;
5051 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5052 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5055 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5056 int refa = h->ref_cache[list][scan8[n] - 1];
5057 int refb = h->ref_cache[list][scan8[n] - 8];
5058 int ref = 0;
5059 int ctx = 0;
5061 if( h->slice_type_nos == FF_B_TYPE) {
5062 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5063 ctx++;
5064 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5065 ctx += 2;
5066 } else {
5067 if( refa > 0 )
5068 ctx++;
5069 if( refb > 0 )
5070 ctx += 2;
5073 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5074 ref++;
5075 if( ctx < 4 )
5076 ctx = 4;
5077 else
5078 ctx = 5;
5079 if(ref >= 32 /*h->ref_list[list]*/){
5080 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5081 return 0; //FIXME we should return -1 and check the return everywhere
5084 return ref;
5087 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5088 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5089 abs( h->mvd_cache[list][scan8[n] - 8][l] );
5090 int ctxbase = (l == 0) ? 40 : 47;
5091 int ctx, mvd;
5093 if( amvd < 3 )
5094 ctx = 0;
5095 else if( amvd > 32 )
5096 ctx = 2;
5097 else
5098 ctx = 1;
5100 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5101 return 0;
5103 mvd= 1;
5104 ctx= 3;
5105 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5106 mvd++;
5107 if( ctx < 6 )
5108 ctx++;
5111 if( mvd >= 9 ) {
5112 int k = 3;
5113 while( get_cabac_bypass( &h->cabac ) ) {
5114 mvd += 1 << k;
5115 k++;
5116 if(k>24){
5117 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5118 return INT_MIN;
5121 while( k-- ) {
5122 if( get_cabac_bypass( &h->cabac ) )
5123 mvd += 1 << k;
5126 return get_cabac_bypass_sign( &h->cabac, -mvd );
5129 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5130 int nza, nzb;
5131 int ctx = 0;
5133 if( is_dc ) {
5134 if( cat == 0 ) {
5135 nza = h->left_cbp&0x100;
5136 nzb = h-> top_cbp&0x100;
5137 } else {
5138 nza = (h->left_cbp>>(6+idx))&0x01;
5139 nzb = (h-> top_cbp>>(6+idx))&0x01;
5141 } else {
5142 if( cat == 4 ) {
5143 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5144 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5145 } else {
5146 assert(cat == 1 || cat == 2);
5147 nza = h->non_zero_count_cache[scan8[idx] - 1];
5148 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5152 if( nza > 0 )
5153 ctx++;
5155 if( nzb > 0 )
5156 ctx += 2;
5158 return ctx + 4 * cat;
5161 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5162 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5163 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5164 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5165 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
/**
 * Decode the residual coefficients of one block with CABAC:
 * coded_block_flag, significance map, then levels in reverse scan order.
 *
 * Always-inlined worker: is_dc is a compile-time constant in the
 * specialized wrappers below, so the is_dc branches fold away.
 *
 * @param block     output coefficient array, indexed through scantable
 * @param cat       coefficient category, see the table further down
 * @param n         block index within the macroblock (meaning depends on cat)
 * @param scantable maps scan position -> coefficient index
 * @param qmul      dequant multipliers; unused on the DC paths
 * @param max_coeff number of coefficients in the block (4/15/16/64)
 * @param is_dc     nonzero for the DC-only categories (0 and 3)
 */
static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
    /* base ctx index of significant_coeff_flag, [frame/field][cat] */
    static const int significant_coeff_flag_offset[2][6] = {
      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
    /* base ctx index of last_significant_coeff_flag, [frame/field][cat] */
    static const int last_coeff_flag_offset[2][6] = {
      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
    /* base ctx index of coeff_abs_level_minus1, per cat */
    static const int coeff_abs_level_m1_offset[6] = {
        227+0, 227+10, 227+20, 227+30, 227+39, 426
    /* significance ctx increment per 8x8 scan position, [frame/field] */
    static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
      { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
        4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
        7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
      { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
        6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
        9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
        9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
    /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
     * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
     * map node ctx => cabac ctx for level=1 */
    static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
    /* map node ctx => cabac ctx for level>1 */
    static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
    static const uint8_t coeff_abs_level_transition[2][8] = {
    /* update node ctx after decoding a level=1 */
        { 1, 2, 3, 3, 4, 5, 6, 7 },
    /* update node ctx after decoding a level>1 */
        { 4, 4, 4, 4, 5, 6, 7, 7 }

    int index[64];              /* scan positions of the significant coefficients */

    int av_unused last;
    int coeff_count = 0;        /* number of significant coefficients found */
    int node_ctx = 0;           /* level-context state machine, see tables above */

    uint8_t *significant_coeff_ctx_base;
    uint8_t *last_coeff_ctx_base;
    uint8_t *abs_level_m1_ctx_base;

#ifndef ARCH_X86
#define CABAC_ON_STACK
#endif
#ifdef CABAC_ON_STACK
#define CC &cc
    /* work on a local copy of the CABAC state so the compiler can keep it
     * in registers; written back before every return */
    CABACContext cc;
    cc.range = h->cabac.range;
    cc.low = h->cabac.low;
    cc.bytestream= h->cabac.bytestream;
#else
#define CC &h->cabac
#endif

    /* cat: 0-> DC 16x16 n = 0
     * 1-> AC 16x16 n = luma4x4idx
     * 2-> Luma4x4 n = luma4x4idx
     * 3-> DC Chroma n = iCbCr
     * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
     * 5-> Luma8x8 n = 4 * luma8x8idx
     */

    /* read coded block flag */
    if( is_dc || cat != 5 ) {  /* 8x8 luma blocks carry no coded_block_flag */
        if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
            /* block is all zero: clear the nnz cache entry and bail out */
            if( !is_dc ) {
                if( cat == 4 )
                    h->non_zero_count_cache[scan8[16+n]] = 0;
                else
                    h->non_zero_count_cache[scan8[n]] = 0;
#ifdef CABAC_ON_STACK
            h->cabac.range = cc.range ;
            h->cabac.low = cc.low ;
            h->cabac.bytestream= cc.bytestream;
#endif
            return;

    significant_coeff_ctx_base = h->cabac_state
        + significant_coeff_flag_offset[MB_FIELD][cat];
    last_coeff_ctx_base = h->cabac_state
        + last_coeff_flag_offset[MB_FIELD][cat];
    abs_level_m1_ctx_base = h->cabac_state
        + coeff_abs_level_m1_offset[cat];

    if( !is_dc && cat == 5 ) {
#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
        for(last= 0; last < coefs; last++) { \
            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
            if( get_cabac( CC, sig_ctx )) { \
                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
                index[coeff_count++] = last; \
                if( get_cabac( CC, last_ctx ) ) { \
                    last= max_coeff; \
                    break; \
        if( last == max_coeff -1 ) {\
            index[coeff_count++] = last;\
        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
#if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
        /* hand-written asm significance decoders on capable x86 */
        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
    } else {
        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
#else
        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
    } else {
        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
#endif
    assert(coeff_count > 0);

    /* record the coded status for deblocking / neighbour contexts */
    if( is_dc ) {
        if( cat == 0 )
            h->cbp_table[h->mb_xy] |= 0x100;
        else
            h->cbp_table[h->mb_xy] |= 0x40 << n;
    } else {
        if( cat == 5 )
            fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
        else if( cat == 4 )
            h->non_zero_count_cache[scan8[16+n]] = coeff_count;
        else {
            assert( cat == 1 || cat == 2 );
            h->non_zero_count_cache[scan8[n]] = coeff_count;

    /* decode the levels in reverse scan order */
    do {
        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;

        int j= scantable[index[--coeff_count]];

        if( get_cabac( CC, ctx ) == 0 ) {
            /* |level| == 1 */
            node_ctx = coeff_abs_level_transition[0][node_ctx];
            if( is_dc ) {
                block[j] = get_cabac_bypass_sign( CC, -1);
            }else{
                block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
        } else {
            /* |level| >= 2: truncated unary up to 14, then bypass-coded
             * exp-golomb escape */
            int coeff_abs = 2;
            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
            node_ctx = coeff_abs_level_transition[1][node_ctx];

            while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
                coeff_abs++;

            if( coeff_abs >= 15 ) {
                int j = 0;
                while( get_cabac_bypass( CC ) ) {
                    j++;

                coeff_abs=1;
                while( j-- ) {
                    coeff_abs += coeff_abs + get_cabac_bypass( CC );
                coeff_abs+= 14;

            if( is_dc ) {
                block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
            }else{
                block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
    } while( coeff_count );
#ifdef CABAC_ON_STACK
    h->cabac.range = cc.range ;
    h->cabac.low = cc.low ;
    h->cabac.bytestream= cc.bytestream;
#endif
5355 #ifndef CONFIG_SMALL
5356 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5357 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5360 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5361 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5363 #endif
5365 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5366 #ifdef CONFIG_SMALL
5367 decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5368 #else
5369 if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5370 else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5371 #endif
5374 static inline void compute_mb_neighbors(H264Context *h)
5376 MpegEncContext * const s = &h->s;
5377 const int mb_xy = h->mb_xy;
5378 h->top_mb_xy = mb_xy - s->mb_stride;
5379 h->left_mb_xy[0] = mb_xy - 1;
5380 if(FRAME_MBAFF){
5381 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5382 const int top_pair_xy = pair_xy - s->mb_stride;
5383 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5384 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5385 const int curr_mb_frame_flag = !MB_FIELD;
5386 const int bottom = (s->mb_y & 1);
5387 if (bottom
5388 ? !curr_mb_frame_flag // bottom macroblock
5389 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5391 h->top_mb_xy -= s->mb_stride;
5393 if (left_mb_frame_flag != curr_mb_frame_flag) {
5394 h->left_mb_xy[0] = pair_xy - 1;
5396 } else if (FIELD_PICTURE) {
5397 h->top_mb_xy -= s->mb_stride;
5399 return;
5403 * decodes a macroblock
5404 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5406 static int decode_mb_cabac(H264Context *h) {
5407 MpegEncContext * const s = &h->s;
5408 int mb_xy;
5409 int mb_type, partition_count, cbp = 0;
5410 int dct8x8_allowed= h->pps.transform_8x8_mode;
5412 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5414 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5416 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5417 if( h->slice_type_nos != FF_I_TYPE ) {
5418 int skip;
5419 /* a skipped mb needs the aff flag from the following mb */
5420 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5421 predict_field_decoding_flag(h);
5422 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5423 skip = h->next_mb_skipped;
5424 else
5425 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5426 /* read skip flags */
5427 if( skip ) {
5428 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5429 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5430 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5431 if(h->next_mb_skipped)
5432 predict_field_decoding_flag(h);
5433 else
5434 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5437 decode_mb_skip(h);
5439 h->cbp_table[mb_xy] = 0;
5440 h->chroma_pred_mode_table[mb_xy] = 0;
5441 h->last_qscale_diff = 0;
5443 return 0;
5447 if(FRAME_MBAFF){
5448 if( (s->mb_y&1) == 0 )
5449 h->mb_mbaff =
5450 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5453 h->prev_mb_skipped = 0;
5455 compute_mb_neighbors(h);
5456 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5457 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5458 return -1;
5461 if( h->slice_type_nos == FF_B_TYPE ) {
5462 if( mb_type < 23 ){
5463 partition_count= b_mb_type_info[mb_type].partition_count;
5464 mb_type= b_mb_type_info[mb_type].type;
5465 }else{
5466 mb_type -= 23;
5467 goto decode_intra_mb;
5469 } else if( h->slice_type_nos == FF_P_TYPE ) {
5470 if( mb_type < 5) {
5471 partition_count= p_mb_type_info[mb_type].partition_count;
5472 mb_type= p_mb_type_info[mb_type].type;
5473 } else {
5474 mb_type -= 5;
5475 goto decode_intra_mb;
5477 } else {
5478 if(h->slice_type == FF_SI_TYPE && mb_type)
5479 mb_type--;
5480 assert(h->slice_type_nos == FF_I_TYPE);
5481 decode_intra_mb:
5482 partition_count = 0;
5483 cbp= i_mb_type_info[mb_type].cbp;
5484 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5485 mb_type= i_mb_type_info[mb_type].type;
5487 if(MB_FIELD)
5488 mb_type |= MB_TYPE_INTERLACED;
5490 h->slice_table[ mb_xy ]= h->slice_num;
5492 if(IS_INTRA_PCM(mb_type)) {
5493 const uint8_t *ptr;
5495 // We assume these blocks are very rare so we do not optimize it.
5496 // FIXME The two following lines get the bitstream position in the cabac
5497 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5498 ptr= h->cabac.bytestream;
5499 if(h->cabac.low&0x1) ptr--;
5500 if(CABAC_BITS==16){
5501 if(h->cabac.low&0x1FF) ptr--;
5504 // The pixels are stored in the same order as levels in h->mb array.
5505 memcpy(h->mb, ptr, 256); ptr+=256;
5506 if(CHROMA){
5507 memcpy(h->mb+128, ptr, 128); ptr+=128;
5510 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5512 // All blocks are present
5513 h->cbp_table[mb_xy] = 0x1ef;
5514 h->chroma_pred_mode_table[mb_xy] = 0;
5515 // In deblocking, the quantizer is 0
5516 s->current_picture.qscale_table[mb_xy]= 0;
5517 // All coeffs are present
5518 memset(h->non_zero_count[mb_xy], 16, 16);
5519 s->current_picture.mb_type[mb_xy]= mb_type;
5520 h->last_qscale_diff = 0;
5521 return 0;
5524 if(MB_MBAFF){
5525 h->ref_count[0] <<= 1;
5526 h->ref_count[1] <<= 1;
5529 fill_caches(h, mb_type, 0);
5531 if( IS_INTRA( mb_type ) ) {
5532 int i, pred_mode;
5533 if( IS_INTRA4x4( mb_type ) ) {
5534 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5535 mb_type |= MB_TYPE_8x8DCT;
5536 for( i = 0; i < 16; i+=4 ) {
5537 int pred = pred_intra_mode( h, i );
5538 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5539 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5541 } else {
5542 for( i = 0; i < 16; i++ ) {
5543 int pred = pred_intra_mode( h, i );
5544 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5546 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5549 write_back_intra_pred_mode(h);
5550 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5551 } else {
5552 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5553 if( h->intra16x16_pred_mode < 0 ) return -1;
5555 if(CHROMA){
5556 h->chroma_pred_mode_table[mb_xy] =
5557 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5559 pred_mode= check_intra_pred_mode( h, pred_mode );
5560 if( pred_mode < 0 ) return -1;
5561 h->chroma_pred_mode= pred_mode;
5563 } else if( partition_count == 4 ) {
5564 int i, j, sub_partition_count[4], list, ref[2][4];
5566 if( h->slice_type_nos == FF_B_TYPE ) {
5567 for( i = 0; i < 4; i++ ) {
5568 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5569 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5570 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5572 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5573 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5574 pred_direct_motion(h, &mb_type);
5575 h->ref_cache[0][scan8[4]] =
5576 h->ref_cache[1][scan8[4]] =
5577 h->ref_cache[0][scan8[12]] =
5578 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5579 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5580 for( i = 0; i < 4; i++ )
5581 if( IS_DIRECT(h->sub_mb_type[i]) )
5582 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5585 } else {
5586 for( i = 0; i < 4; i++ ) {
5587 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5588 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5589 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5593 for( list = 0; list < h->list_count; list++ ) {
5594 for( i = 0; i < 4; i++ ) {
5595 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5596 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5597 if( h->ref_count[list] > 1 )
5598 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5599 else
5600 ref[list][i] = 0;
5601 } else {
5602 ref[list][i] = -1;
5604 h->ref_cache[list][ scan8[4*i]+1 ]=
5605 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5609 if(dct8x8_allowed)
5610 dct8x8_allowed = get_dct8x8_allowed(h);
5612 for(list=0; list<h->list_count; list++){
5613 for(i=0; i<4; i++){
5614 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5615 if(IS_DIRECT(h->sub_mb_type[i])){
5616 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5617 continue;
5620 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5621 const int sub_mb_type= h->sub_mb_type[i];
5622 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5623 for(j=0; j<sub_partition_count[i]; j++){
5624 int mpx, mpy;
5625 int mx, my;
5626 const int index= 4*i + block_width*j;
5627 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5628 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5629 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5631 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5632 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5633 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5635 if(IS_SUB_8X8(sub_mb_type)){
5636 mv_cache[ 1 ][0]=
5637 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5638 mv_cache[ 1 ][1]=
5639 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5641 mvd_cache[ 1 ][0]=
5642 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5643 mvd_cache[ 1 ][1]=
5644 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5645 }else if(IS_SUB_8X4(sub_mb_type)){
5646 mv_cache[ 1 ][0]= mx;
5647 mv_cache[ 1 ][1]= my;
5649 mvd_cache[ 1 ][0]= mx - mpx;
5650 mvd_cache[ 1 ][1]= my - mpy;
5651 }else if(IS_SUB_4X8(sub_mb_type)){
5652 mv_cache[ 8 ][0]= mx;
5653 mv_cache[ 8 ][1]= my;
5655 mvd_cache[ 8 ][0]= mx - mpx;
5656 mvd_cache[ 8 ][1]= my - mpy;
5658 mv_cache[ 0 ][0]= mx;
5659 mv_cache[ 0 ][1]= my;
5661 mvd_cache[ 0 ][0]= mx - mpx;
5662 mvd_cache[ 0 ][1]= my - mpy;
5664 }else{
5665 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5666 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5667 p[0] = p[1] = p[8] = p[9] = 0;
5668 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5672 } else if( IS_DIRECT(mb_type) ) {
5673 pred_direct_motion(h, &mb_type);
5674 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5675 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5676 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5677 } else {
5678 int list, mx, my, i, mpx, mpy;
5679 if(IS_16X16(mb_type)){
5680 for(list=0; list<h->list_count; list++){
5681 if(IS_DIR(mb_type, 0, list)){
5682 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5683 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5684 }else
5685 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5687 for(list=0; list<h->list_count; list++){
5688 if(IS_DIR(mb_type, 0, list)){
5689 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5691 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5692 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5693 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5695 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5696 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5697 }else
5698 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5701 else if(IS_16X8(mb_type)){
5702 for(list=0; list<h->list_count; list++){
5703 for(i=0; i<2; i++){
5704 if(IS_DIR(mb_type, i, list)){
5705 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5706 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5707 }else
5708 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5711 for(list=0; list<h->list_count; list++){
5712 for(i=0; i<2; i++){
5713 if(IS_DIR(mb_type, i, list)){
5714 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5715 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5716 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5717 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5719 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5720 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5721 }else{
5722 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5723 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5727 }else{
5728 assert(IS_8X16(mb_type));
5729 for(list=0; list<h->list_count; list++){
5730 for(i=0; i<2; i++){
5731 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5732 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5733 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5734 }else
5735 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5738 for(list=0; list<h->list_count; list++){
5739 for(i=0; i<2; i++){
5740 if(IS_DIR(mb_type, i, list)){
5741 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5742 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5743 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5745 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5746 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5747 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5748 }else{
5749 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5750 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5757 if( IS_INTER( mb_type ) ) {
5758 h->chroma_pred_mode_table[mb_xy] = 0;
5759 write_back_motion( h, mb_type );
5762 if( !IS_INTRA16x16( mb_type ) ) {
5763 cbp = decode_cabac_mb_cbp_luma( h );
5764 if(CHROMA)
5765 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5768 h->cbp_table[mb_xy] = h->cbp = cbp;
5770 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5771 if( decode_cabac_mb_transform_size( h ) )
5772 mb_type |= MB_TYPE_8x8DCT;
5774 s->current_picture.mb_type[mb_xy]= mb_type;
5776 if( cbp || IS_INTRA16x16( mb_type ) ) {
5777 const uint8_t *scan, *scan8x8, *dc_scan;
5778 const uint32_t *qmul;
5779 int dqp;
5781 if(IS_INTERLACED(mb_type)){
5782 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5783 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5784 dc_scan= luma_dc_field_scan;
5785 }else{
5786 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5787 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5788 dc_scan= luma_dc_zigzag_scan;
5791 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5792 if( dqp == INT_MIN ){
5793 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5794 return -1;
5796 s->qscale += dqp;
5797 if(((unsigned)s->qscale) > 51){
5798 if(s->qscale<0) s->qscale+= 52;
5799 else s->qscale-= 52;
5801 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5802 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5804 if( IS_INTRA16x16( mb_type ) ) {
5805 int i;
5806 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5807 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5809 if( cbp&15 ) {
5810 qmul = h->dequant4_coeff[0][s->qscale];
5811 for( i = 0; i < 16; i++ ) {
5812 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5813 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5815 } else {
5816 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5818 } else {
5819 int i8x8, i4x4;
5820 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5821 if( cbp & (1<<i8x8) ) {
5822 if( IS_8x8DCT(mb_type) ) {
5823 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5824 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5825 } else {
5826 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5827 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5828 const int index = 4*i8x8 + i4x4;
5829 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5830 //START_TIMER
5831 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5832 //STOP_TIMER("decode_residual")
5835 } else {
5836 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5837 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5842 if( cbp&0x30 ){
5843 int c;
5844 for( c = 0; c < 2; c++ ) {
5845 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5846 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5850 if( cbp&0x20 ) {
5851 int c, i;
5852 for( c = 0; c < 2; c++ ) {
5853 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5854 for( i = 0; i < 4; i++ ) {
5855 const int index = 16 + 4 * c + i;
5856 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5857 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5860 } else {
5861 uint8_t * const nnz= &h->non_zero_count_cache[0];
5862 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5863 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5865 } else {
5866 uint8_t * const nnz= &h->non_zero_count_cache[0];
5867 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5868 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5869 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5870 h->last_qscale_diff = 0;
5873 s->current_picture.qscale_table[mb_xy]= s->qscale;
5874 write_back_non_zero_count(h);
5876 if(MB_MBAFF){
5877 h->ref_count[0] >>= 1;
5878 h->ref_count[1] >>= 1;
5881 return 0;
5885 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5886 int i, d;
5887 const int index_a = qp + h->slice_alpha_c0_offset;
5888 const int alpha = (alpha_table+52)[index_a];
5889 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5891 if( bS[0] < 4 ) {
5892 int8_t tc[4];
5893 for(i=0; i<4; i++)
5894 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5895 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5896 } else {
5897 /* 16px edge length, because bS=4 is triggered by being at
5898 * the edge of an intra MB, so all 4 bS are the same */
5899 for( d = 0; d < 16; d++ ) {
5900 const int p0 = pix[-1];
5901 const int p1 = pix[-2];
5902 const int p2 = pix[-3];
5904 const int q0 = pix[0];
5905 const int q1 = pix[1];
5906 const int q2 = pix[2];
5908 if( FFABS( p0 - q0 ) < alpha &&
5909 FFABS( p1 - p0 ) < beta &&
5910 FFABS( q1 - q0 ) < beta ) {
5912 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5913 if( FFABS( p2 - p0 ) < beta)
5915 const int p3 = pix[-4];
5916 /* p0', p1', p2' */
5917 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5918 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5919 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5920 } else {
5921 /* p0' */
5922 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5924 if( FFABS( q2 - q0 ) < beta)
5926 const int q3 = pix[3];
5927 /* q0', q1', q2' */
5928 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5929 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5930 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5931 } else {
5932 /* q0' */
5933 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5935 }else{
5936 /* p0', q0' */
5937 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5938 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5940 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5942 pix += stride;
5946 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5947 int i;
5948 const int index_a = qp + h->slice_alpha_c0_offset;
5949 const int alpha = (alpha_table+52)[index_a];
5950 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5952 if( bS[0] < 4 ) {
5953 int8_t tc[4];
5954 for(i=0; i<4; i++)
5955 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5956 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5957 } else {
5958 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5962 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5963 int i;
5964 for( i = 0; i < 16; i++, pix += stride) {
5965 int index_a;
5966 int alpha;
5967 int beta;
5969 int qp_index;
5970 int bS_index = (i >> 1);
5971 if (!MB_FIELD) {
5972 bS_index &= ~1;
5973 bS_index |= (i & 1);
5976 if( bS[bS_index] == 0 ) {
5977 continue;
5980 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5981 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5982 alpha = (alpha_table+52)[index_a];
5983 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5985 if( bS[bS_index] < 4 ) {
5986 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5987 const int p0 = pix[-1];
5988 const int p1 = pix[-2];
5989 const int p2 = pix[-3];
5990 const int q0 = pix[0];
5991 const int q1 = pix[1];
5992 const int q2 = pix[2];
5994 if( FFABS( p0 - q0 ) < alpha &&
5995 FFABS( p1 - p0 ) < beta &&
5996 FFABS( q1 - q0 ) < beta ) {
5997 int tc = tc0;
5998 int i_delta;
6000 if( FFABS( p2 - p0 ) < beta ) {
6001 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6002 tc++;
6004 if( FFABS( q2 - q0 ) < beta ) {
6005 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6006 tc++;
6009 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6010 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6011 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6012 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6014 }else{
6015 const int p0 = pix[-1];
6016 const int p1 = pix[-2];
6017 const int p2 = pix[-3];
6019 const int q0 = pix[0];
6020 const int q1 = pix[1];
6021 const int q2 = pix[2];
6023 if( FFABS( p0 - q0 ) < alpha &&
6024 FFABS( p1 - p0 ) < beta &&
6025 FFABS( q1 - q0 ) < beta ) {
6027 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6028 if( FFABS( p2 - p0 ) < beta)
6030 const int p3 = pix[-4];
6031 /* p0', p1', p2' */
6032 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6033 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6034 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6035 } else {
6036 /* p0' */
6037 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6039 if( FFABS( q2 - q0 ) < beta)
6041 const int q3 = pix[3];
6042 /* q0', q1', q2' */
6043 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6044 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6045 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6046 } else {
6047 /* q0' */
6048 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6050 }else{
6051 /* p0', q0' */
6052 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6053 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6055 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6060 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6061 int i;
6062 for( i = 0; i < 8; i++, pix += stride) {
6063 int index_a;
6064 int alpha;
6065 int beta;
6067 int qp_index;
6068 int bS_index = i;
6070 if( bS[bS_index] == 0 ) {
6071 continue;
6074 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6075 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6076 alpha = (alpha_table+52)[index_a];
6077 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6079 if( bS[bS_index] < 4 ) {
6080 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6081 const int p0 = pix[-1];
6082 const int p1 = pix[-2];
6083 const int q0 = pix[0];
6084 const int q1 = pix[1];
6086 if( FFABS( p0 - q0 ) < alpha &&
6087 FFABS( p1 - p0 ) < beta &&
6088 FFABS( q1 - q0 ) < beta ) {
6089 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6091 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6092 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6093 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6095 }else{
6096 const int p0 = pix[-1];
6097 const int p1 = pix[-2];
6098 const int q0 = pix[0];
6099 const int q1 = pix[1];
6101 if( FFABS( p0 - q0 ) < alpha &&
6102 FFABS( p1 - p0 ) < beta &&
6103 FFABS( q1 - q0 ) < beta ) {
6105 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6106 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6107 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6113 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6114 int i, d;
6115 const int index_a = qp + h->slice_alpha_c0_offset;
6116 const int alpha = (alpha_table+52)[index_a];
6117 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6118 const int pix_next = stride;
6120 if( bS[0] < 4 ) {
6121 int8_t tc[4];
6122 for(i=0; i<4; i++)
6123 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6124 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6125 } else {
6126 /* 16px edge length, see filter_mb_edgev */
6127 for( d = 0; d < 16; d++ ) {
6128 const int p0 = pix[-1*pix_next];
6129 const int p1 = pix[-2*pix_next];
6130 const int p2 = pix[-3*pix_next];
6131 const int q0 = pix[0];
6132 const int q1 = pix[1*pix_next];
6133 const int q2 = pix[2*pix_next];
6135 if( FFABS( p0 - q0 ) < alpha &&
6136 FFABS( p1 - p0 ) < beta &&
6137 FFABS( q1 - q0 ) < beta ) {
6139 const int p3 = pix[-4*pix_next];
6140 const int q3 = pix[ 3*pix_next];
6142 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6143 if( FFABS( p2 - p0 ) < beta) {
6144 /* p0', p1', p2' */
6145 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6146 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6147 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6148 } else {
6149 /* p0' */
6150 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6152 if( FFABS( q2 - q0 ) < beta) {
6153 /* q0', q1', q2' */
6154 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6155 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6156 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6157 } else {
6158 /* q0' */
6159 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6161 }else{
6162 /* p0', q0' */
6163 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6164 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6166 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6168 pix++;
6173 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6174 int i;
6175 const int index_a = qp + h->slice_alpha_c0_offset;
6176 const int alpha = (alpha_table+52)[index_a];
6177 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6179 if( bS[0] < 4 ) {
6180 int8_t tc[4];
6181 for(i=0; i<4; i++)
6182 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6183 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6184 } else {
6185 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6189 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6190 MpegEncContext * const s = &h->s;
6191 int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6192 int mb_xy, mb_type;
6193 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6195 mb_xy = h->mb_xy;
6197 if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6198 1 ||
6199 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6200 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6201 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6202 return;
6204 assert(!FRAME_MBAFF);
6206 mb_type = s->current_picture.mb_type[mb_xy];
6207 qp = s->current_picture.qscale_table[mb_xy];
6208 qp0 = s->current_picture.qscale_table[mb_xy-1];
6209 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6210 qpc = get_chroma_qp( h, 0, qp );
6211 qpc0 = get_chroma_qp( h, 0, qp0 );
6212 qpc1 = get_chroma_qp( h, 0, qp1 );
6213 qp0 = (qp + qp0 + 1) >> 1;
6214 qp1 = (qp + qp1 + 1) >> 1;
6215 qpc0 = (qpc + qpc0 + 1) >> 1;
6216 qpc1 = (qpc + qpc1 + 1) >> 1;
6217 qp_thresh = 15 - h->slice_alpha_c0_offset;
6218 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6219 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6220 return;
6222 if( IS_INTRA(mb_type) ) {
6223 int16_t bS4[4] = {4,4,4,4};
6224 int16_t bS3[4] = {3,3,3,3};
6225 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6226 if( IS_8x8DCT(mb_type) ) {
6227 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6228 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6229 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6230 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6231 } else {
6232 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6233 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6234 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6235 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6236 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6237 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6238 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6239 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6241 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6242 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6243 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6244 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6245 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6246 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6247 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6248 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6249 return;
6250 } else {
6251 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6252 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6253 int edges;
6254 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6255 edges = 4;
6256 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6257 } else {
6258 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6259 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6260 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6261 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6262 ? 3 : 0;
6263 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6264 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6265 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6266 (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6268 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6269 bSv[0][0] = 0x0004000400040004ULL;
6270 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6271 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6273 #define FILTER(hv,dir,edge)\
6274 if(bSv[dir][edge]) {\
6275 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6276 if(!(edge&1)) {\
6277 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6278 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6281 if( edges == 1 ) {
6282 FILTER(v,0,0);
6283 FILTER(h,1,0);
6284 } else if( IS_8x8DCT(mb_type) ) {
6285 FILTER(v,0,0);
6286 FILTER(v,0,2);
6287 FILTER(h,1,0);
6288 FILTER(h,1,2);
6289 } else {
6290 FILTER(v,0,0);
6291 FILTER(v,0,1);
6292 FILTER(v,0,2);
6293 FILTER(v,0,3);
6294 FILTER(h,1,0);
6295 FILTER(h,1,1);
6296 FILTER(h,1,2);
6297 FILTER(h,1,3);
6299 #undef FILTER
6303 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6304 MpegEncContext * const s = &h->s;
6305 const int mb_xy= mb_x + mb_y*s->mb_stride;
6306 const int mb_type = s->current_picture.mb_type[mb_xy];
6307 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6308 int first_vertical_edge_done = 0;
6309 int dir;
6311 //for sufficiently low qp, filtering wouldn't do anything
6312 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6313 if(!FRAME_MBAFF){
6314 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6315 int qp = s->current_picture.qscale_table[mb_xy];
6316 if(qp <= qp_thresh
6317 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6318 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6319 return;
6323 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6324 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6325 int top_type, left_type[2];
6326 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6327 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6328 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6330 if(IS_8x8DCT(top_type)){
6331 h->non_zero_count_cache[4+8*0]=
6332 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6333 h->non_zero_count_cache[6+8*0]=
6334 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6336 if(IS_8x8DCT(left_type[0])){
6337 h->non_zero_count_cache[3+8*1]=
6338 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6340 if(IS_8x8DCT(left_type[1])){
6341 h->non_zero_count_cache[3+8*3]=
6342 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6345 if(IS_8x8DCT(mb_type)){
6346 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6347 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
6349 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6350 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6352 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6353 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6355 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6356 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6360 if (FRAME_MBAFF
6361 // left mb is in picture
6362 && h->slice_table[mb_xy-1] != 255
6363 // and current and left pair do not have the same interlaced type
6364 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6365 // and left mb is in the same slice if deblocking_filter == 2
6366 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6367 /* First vertical edge is different in MBAFF frames
6368 * There are 8 different bS to compute and 2 different Qp
6370 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6371 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6372 int16_t bS[8];
6373 int qp[2];
6374 int bqp[2];
6375 int rqp[2];
6376 int mb_qp, mbn0_qp, mbn1_qp;
6377 int i;
6378 first_vertical_edge_done = 1;
6380 if( IS_INTRA(mb_type) )
6381 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6382 else {
6383 for( i = 0; i < 8; i++ ) {
6384 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6386 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6387 bS[i] = 4;
6388 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6389 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6390 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6392 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6393 bS[i] = 2;
6394 else
6395 bS[i] = 1;
6399 mb_qp = s->current_picture.qscale_table[mb_xy];
6400 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6401 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6402 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6403 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6404 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6405 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6406 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6407 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6408 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6409 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6410 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6411 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6413 /* Filter edge */
6414 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6415 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6416 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6417 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6418 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6420 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6421 for( dir = 0; dir < 2; dir++ )
6423 int edge;
6424 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6425 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6426 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &15 ][0] + (MB_MBAFF ? 20 : 2);
6427 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&15 ][0] + (MB_MBAFF ? 20 : 2);
6428 int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6430 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6431 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6432 // how often to recheck mv-based bS when iterating between edges
6433 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6434 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6435 // how often to recheck mv-based bS when iterating along each edge
6436 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6438 if (first_vertical_edge_done) {
6439 start = 1;
6440 first_vertical_edge_done = 0;
6443 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6444 start = 1;
6446 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6447 && !IS_INTERLACED(mb_type)
6448 && IS_INTERLACED(mbm_type)
6450 // This is a special case in the norm where the filtering must
6451 // be done twice (one each of the field) even if we are in a
6452 // frame macroblock.
6454 static const int nnz_idx[4] = {4,5,6,3};
6455 unsigned int tmp_linesize = 2 * linesize;
6456 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6457 int mbn_xy = mb_xy - 2 * s->mb_stride;
6458 int qp;
6459 int i, j;
6460 int16_t bS[4];
6462 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6463 if( IS_INTRA(mb_type) ||
6464 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6465 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6466 } else {
6467 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6468 for( i = 0; i < 4; i++ ) {
6469 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6470 mbn_nnz[nnz_idx[i]] != 0 )
6471 bS[i] = 2;
6472 else
6473 bS[i] = 1;
6476 // Do not use s->qscale as luma quantizer because it has not the same
6477 // value in IPCM macroblocks.
6478 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6479 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6480 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6481 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6482 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6483 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6484 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6485 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6488 start = 1;
6491 /* Calculate bS */
6492 for( edge = start; edge < edges; edge++ ) {
6493 /* mbn_xy: neighbor macroblock */
6494 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6495 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6496 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6497 int16_t bS[4];
6498 int qp;
6500 if( (edge&1) && IS_8x8DCT(mb_type) )
6501 continue;
6503 if( IS_INTRA(mb_type) ||
6504 IS_INTRA(mbn_type) ) {
6505 int value;
6506 if (edge == 0) {
6507 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6508 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6510 value = 4;
6511 } else {
6512 value = 3;
6514 } else {
6515 value = 3;
6517 bS[0] = bS[1] = bS[2] = bS[3] = value;
6518 } else {
6519 int i, l;
6520 int mv_done;
6522 if( edge & mask_edge ) {
6523 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6524 mv_done = 1;
6526 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6527 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6528 mv_done = 1;
6530 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6531 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6532 int bn_idx= b_idx - (dir ? 8:1);
6533 int v = 0;
6535 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6536 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6537 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6538 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6541 if(h->slice_type_nos == FF_B_TYPE && v){
6542 v=0;
6543 for( l = 0; !v && l < 2; l++ ) {
6544 int ln= 1-l;
6545 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6546 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6547 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6551 bS[0] = bS[1] = bS[2] = bS[3] = v;
6552 mv_done = 1;
6554 else
6555 mv_done = 0;
6557 for( i = 0; i < 4; i++ ) {
6558 int x = dir == 0 ? edge : i;
6559 int y = dir == 0 ? i : edge;
6560 int b_idx= 8 + 4 + x + 8*y;
6561 int bn_idx= b_idx - (dir ? 8:1);
6563 if( h->non_zero_count_cache[b_idx] != 0 ||
6564 h->non_zero_count_cache[bn_idx] != 0 ) {
6565 bS[i] = 2;
6567 else if(!mv_done)
6569 bS[i] = 0;
6570 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6571 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6572 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6573 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6574 bS[i] = 1;
6575 break;
6579 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6580 bS[i] = 0;
6581 for( l = 0; l < 2; l++ ) {
6582 int ln= 1-l;
6583 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6584 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6585 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6586 bS[i] = 1;
6587 break;
6594 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6595 continue;
6598 /* Filter edge */
6599 // Do not use s->qscale as luma quantizer because it has not the same
6600 // value in IPCM macroblocks.
6601 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6602 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6603 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6604 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6605 if( dir == 0 ) {
6606 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6607 if( (edge&1) == 0 ) {
6608 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6609 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6610 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6611 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6613 } else {
6614 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6615 if( (edge&1) == 0 ) {
6616 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6617 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6618 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6619 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6626 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6627 MpegEncContext * const s = &h->s;
6628 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6630 s->mb_skip_run= -1;
6632 if( h->pps.cabac ) {
6633 int i;
6635 /* realign */
6636 align_get_bits( &s->gb );
6638 /* init cabac */
6639 ff_init_cabac_states( &h->cabac);
6640 ff_init_cabac_decoder( &h->cabac,
6641 s->gb.buffer + get_bits_count(&s->gb)/8,
6642 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6643 /* calculate pre-state */
6644 for( i= 0; i < 460; i++ ) {
6645 int pre;
6646 if( h->slice_type_nos == FF_I_TYPE )
6647 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6648 else
6649 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6651 if( pre <= 63 )
6652 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6653 else
6654 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6657 for(;;){
6658 //START_TIMER
6659 int ret = decode_mb_cabac(h);
6660 int eos;
6661 //STOP_TIMER("decode_mb_cabac")
6663 if(ret>=0) hl_decode_mb(h);
6665 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6666 s->mb_y++;
6668 if(ret>=0) ret = decode_mb_cabac(h);
6670 if(ret>=0) hl_decode_mb(h);
6671 s->mb_y--;
6673 eos = get_cabac_terminate( &h->cabac );
6675 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6676 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6677 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6678 return -1;
6681 if( ++s->mb_x >= s->mb_width ) {
6682 s->mb_x = 0;
6683 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6684 ++s->mb_y;
6685 if(FIELD_OR_MBAFF_PICTURE) {
6686 ++s->mb_y;
6690 if( eos || s->mb_y >= s->mb_height ) {
6691 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6692 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6693 return 0;
6697 } else {
6698 for(;;){
6699 int ret = decode_mb_cavlc(h);
6701 if(ret>=0) hl_decode_mb(h);
6703 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6704 s->mb_y++;
6705 ret = decode_mb_cavlc(h);
6707 if(ret>=0) hl_decode_mb(h);
6708 s->mb_y--;
6711 if(ret<0){
6712 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6713 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6715 return -1;
6718 if(++s->mb_x >= s->mb_width){
6719 s->mb_x=0;
6720 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6721 ++s->mb_y;
6722 if(FIELD_OR_MBAFF_PICTURE) {
6723 ++s->mb_y;
6725 if(s->mb_y >= s->mb_height){
6726 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6728 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6729 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6731 return 0;
6732 }else{
6733 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6735 return -1;
6740 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6741 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6742 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6743 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6745 return 0;
6746 }else{
6747 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6749 return -1;
6755 #if 0
6756 for(;s->mb_y < s->mb_height; s->mb_y++){
6757 for(;s->mb_x < s->mb_width; s->mb_x++){
6758 int ret= decode_mb(h);
6760 hl_decode_mb(h);
6762 if(ret<0){
6763 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6764 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6766 return -1;
6769 if(++s->mb_x >= s->mb_width){
6770 s->mb_x=0;
6771 if(++s->mb_y >= s->mb_height){
6772 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6773 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6775 return 0;
6776 }else{
6777 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6779 return -1;
6784 if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6785 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6786 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6788 return 0;
6789 }else{
6790 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6792 return -1;
6796 s->mb_x=0;
6797 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6799 #endif
6800 return -1; //not reached
6803 static int decode_unregistered_user_data(H264Context *h, int size){
6804 MpegEncContext * const s = &h->s;
6805 uint8_t user_data[16+256];
6806 int e, build, i;
6808 if(size<16)
6809 return -1;
6811 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6812 user_data[i]= get_bits(&s->gb, 8);
6815 user_data[i]= 0;
6816 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6817 if(e==1 && build>=0)
6818 h->x264_build= build;
6820 if(s->avctx->debug & FF_DEBUG_BUGS)
6821 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6823 for(; i<size; i++)
6824 skip_bits(&s->gb, 8);
6826 return 0;
6829 static int decode_sei(H264Context *h){
6830 MpegEncContext * const s = &h->s;
6832 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6833 int size, type;
6835 type=0;
6837 type+= show_bits(&s->gb, 8);
6838 }while(get_bits(&s->gb, 8) == 255);
6840 size=0;
6842 size+= show_bits(&s->gb, 8);
6843 }while(get_bits(&s->gb, 8) == 255);
6845 switch(type){
6846 case 5:
6847 if(decode_unregistered_user_data(h, size) < 0)
6848 return -1;
6849 break;
6850 default:
6851 skip_bits(&s->gb, 8*size);
6854 //FIXME check bits here
6855 align_get_bits(&s->gb);
6858 return 0;
6861 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6862 MpegEncContext * const s = &h->s;
6863 int cpb_count, i;
6864 cpb_count = get_ue_golomb(&s->gb) + 1;
6865 get_bits(&s->gb, 4); /* bit_rate_scale */
6866 get_bits(&s->gb, 4); /* cpb_size_scale */
6867 for(i=0; i<cpb_count; i++){
6868 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6869 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6870 get_bits1(&s->gb); /* cbr_flag */
6872 get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6873 get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6874 get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6875 get_bits(&s->gb, 5); /* time_offset_length */
/**
 * Decode VUI (Video Usability Information) appended to the SPS
 * (spec Annex E.1.1).
 *
 * Stores the sample aspect ratio, timing info, and the bitstream
 * restriction (num_reorder_frames) in *sps; all other fields are read
 * only to advance the bitstream position.
 *
 * @return 0 on success, -1 on an illegal aspect ratio or
 *         num_reorder_frames value
 */
static inline int decode_vui_parameters(H264Context *h, SPS *sps){
    MpegEncContext * const s = &h->s;
    int aspect_ratio_info_present_flag;
    unsigned int aspect_ratio_idc;
    int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;

    aspect_ratio_info_present_flag= get_bits1(&s->gb);

    if( aspect_ratio_info_present_flag ) {
        aspect_ratio_idc= get_bits(&s->gb, 8);
        if( aspect_ratio_idc == EXTENDED_SAR ) {
            /* explicit sample aspect ratio */
            sps->sar.num= get_bits(&s->gb, 16);
            sps->sar.den= get_bits(&s->gb, 16);
        }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
            /* one of the predefined aspect ratios */
            sps->sar= pixel_aspect[aspect_ratio_idc];
        }else{
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
            return -1;
    }else{
        /* unknown aspect ratio */
        sps->sar.num=
        sps->sar.den= 0;
//      s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);

    if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
        get_bits1(&s->gb);      /* overscan_appropriate_flag */

    if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
        get_bits(&s->gb, 3);    /* video_format */
        get_bits1(&s->gb);      /* video_full_range_flag */
        if(get_bits1(&s->gb)){  /* colour_description_present_flag */
            get_bits(&s->gb, 8); /* colour_primaries */
            get_bits(&s->gb, 8); /* transfer_characteristics */
            get_bits(&s->gb, 8); /* matrix_coefficients */

    if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
        get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
        get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */

    sps->timing_info_present_flag = get_bits1(&s->gb);
    if(sps->timing_info_present_flag){
        sps->num_units_in_tick = get_bits_long(&s->gb, 32);
        sps->time_scale = get_bits_long(&s->gb, 32);
        sps->fixed_frame_rate_flag = get_bits1(&s->gb);

    /* HRD parameters are parsed for position only; values are discarded */
    nal_hrd_parameters_present_flag = get_bits1(&s->gb);
    if(nal_hrd_parameters_present_flag)
        decode_hrd_parameters(h, sps);
    vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
    if(vcl_hrd_parameters_present_flag)
        decode_hrd_parameters(h, sps);
    if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
        get_bits1(&s->gb);      /* low_delay_hrd_flag */
    get_bits1(&s->gb);          /* pic_struct_present_flag */

    sps->bitstream_restriction_flag = get_bits1(&s->gb);
    if(sps->bitstream_restriction_flag){
        unsigned int num_reorder_frames;
        get_bits1(&s->gb);      /* motion_vectors_over_pic_boundaries_flag */
        get_ue_golomb(&s->gb);  /* max_bytes_per_pic_denom */
        get_ue_golomb(&s->gb);  /* max_bits_per_mb_denom */
        get_ue_golomb(&s->gb);  /* log2_max_mv_length_horizontal */
        get_ue_golomb(&s->gb);  /* log2_max_mv_length_vertical */
        num_reorder_frames= get_ue_golomb(&s->gb);
        get_ue_golomb(&s->gb);  /*max_dec_frame_buffering*/

        /* delayed_pic[] elsewhere cannot hold more than 16 frames */
        if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
            return -1;
        sps->num_reorder_frames= num_reorder_frames;

    return 0;
/**
 * Decode one quantization scaling list (size 16 for 4x4, 64 for 8x8).
 *
 * If the list is not present in the bitstream, 'fallback_list' is
 * copied (SPS/PPS prediction). If the first delta zeroes the first
 * coefficient, the JVT default list is used instead. Otherwise each
 * coefficient is delta-coded against the previous one in zigzag order;
 * a zero 'next' repeats the last value for the remaining entries.
 */
static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
                                const uint8_t *jvt_list, const uint8_t *fallback_list){
    MpegEncContext * const s = &h->s;
    int i, last = 8, next = 8;
    /* scan order depends on block size */
    const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
    if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
        memcpy(factors, fallback_list, size*sizeof(uint8_t));
    else
        for(i=0;i<size;i++){
            if(next)
                next = (last + get_se_golomb(&s->gb)) & 0xff;   /* delta_scale, wrapped to 8 bits */
            if(!i && !next){ /* matrix not written, we use the preset one */
                memcpy(factors, jvt_list, size*sizeof(uint8_t));
                break;
            }
            /* next==0 means "repeat the last value from here on" */
            last = factors[scan[i]] = next ? next : last;
/**
 * Decode all scaling matrices of an SPS (is_sps=1) or PPS (is_sps=0).
 *
 * Each list falls back either to the SPS lists (when parsing a PPS and
 * the SPS carried matrices) or to the JVT defaults; intra/inter chroma
 * lists additionally fall back to the previously decoded list of the
 * same prediction type. The 8x8 lists exist only in the SPS or when the
 * PPS enables the 8x8 transform.
 */
static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
    MpegEncContext * const s = &h->s;
    /* PPS lists may be predicted from the SPS lists if those were present */
    int fallback_sps = !is_sps && sps->scaling_matrix_present;
    const uint8_t *fallback[4] = {
        fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
        fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
        fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
    if(get_bits1(&s->gb)){  /* seq/pic_scaling_matrix_present_flag */
        sps->scaling_matrix_present |= is_sps;
        decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
        decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
        decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
        decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
        decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
        decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
        if(is_sps || pps->transform_8x8_mode){
            decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
            decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7006 * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7008 static void *
7009 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7010 const size_t size, const char *name)
7012 if(id>=max) {
7013 av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7014 return NULL;
7017 if(!vec[id]) {
7018 vec[id] = av_mallocz(size);
7019 if(vec[id] == NULL)
7020 av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7022 return vec[id];
/**
 * Decode a sequence parameter set (SPS) NAL unit from s->gb
 * (spec 7.3.2.1) into h->sps_buffers[sps_id].
 *
 * @return 0 on success, -1 on an out-of-range id, illegal POC type,
 *         too many reference frames, or dimension overflow
 */
static inline int decode_seq_parameter_set(H264Context *h){
    MpegEncContext * const s = &h->s;
    int profile_idc, level_idc;
    unsigned int sps_id, tmp, mb_width, mb_height;
    int i;
    SPS *sps;

    profile_idc= get_bits(&s->gb, 8);
    get_bits1(&s->gb);   //constraint_set0_flag
    get_bits1(&s->gb);   //constraint_set1_flag
    get_bits1(&s->gb);   //constraint_set2_flag
    get_bits1(&s->gb);   //constraint_set3_flag
    get_bits(&s->gb, 4); // reserved
    level_idc= get_bits(&s->gb, 8);
    sps_id= get_ue_golomb(&s->gb);

    sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
    if(sps == NULL)
        return -1;

    sps->profile_idc= profile_idc;
    sps->level_idc= level_idc;

    /* default to flat scaling matrices (all coefficients 16) */
    memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
    memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
    sps->scaling_matrix_present = 0;

    if(sps->profile_idc >= 100){ //high profile
        /* NOTE(review): chroma_format_idc is not range-checked here but is
         * later used to index a 4-entry name table below — verify upstream */
        sps->chroma_format_idc= get_ue_golomb(&s->gb);
        if(sps->chroma_format_idc == 3)
            get_bits1(&s->gb);  //residual_color_transform_flag
        get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
        get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
        sps->transform_bypass = get_bits1(&s->gb);
        decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
    }else{
        /* non-high profiles are always 4:2:0 */
        sps->chroma_format_idc= 1;

    sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
    sps->poc_type= get_ue_golomb(&s->gb);

    if(sps->poc_type == 0){ //FIXME #define
        sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
    } else if(sps->poc_type == 1){//FIXME #define
        sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
        sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
        sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
        tmp= get_ue_golomb(&s->gb);

        /* bound the cycle length by the offset table size */
        if(tmp >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
            av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
            return -1;
        sps->poc_cycle_length= tmp;

        for(i=0; i<sps->poc_cycle_length; i++)
            sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
    }else if(sps->poc_type != 2){
        av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
        return -1;

    tmp= get_ue_golomb(&s->gb);
    if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
        av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
        return -1;
    sps->ref_frame_count= tmp;
    sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
    mb_width= get_ue_golomb(&s->gb) + 1;
    mb_height= get_ue_golomb(&s->gb) + 1;
    /* reject dimensions that would overflow 16*mb computations */
    if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
       avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
        av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
        return -1;
    sps->mb_width = mb_width;
    sps->mb_height= mb_height;

    sps->frame_mbs_only_flag= get_bits1(&s->gb);
    if(!sps->frame_mbs_only_flag)
        sps->mb_aff= get_bits1(&s->gb);
    else
        sps->mb_aff= 0;

    sps->direct_8x8_inference_flag= get_bits1(&s->gb);

#ifndef ALLOW_INTERLACE
    if(sps->mb_aff)
        av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
#endif
    sps->crop= get_bits1(&s->gb);
    if(sps->crop){
        sps->crop_left  = get_ue_golomb(&s->gb);
        sps->crop_right = get_ue_golomb(&s->gb);
        sps->crop_top   = get_ue_golomb(&s->gb);
        sps->crop_bottom= get_ue_golomb(&s->gb);
        if(sps->crop_left || sps->crop_top){
            av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
        /* bottom crop limit is halved for interlaced (field) content */
        if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
            av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
    }else{
        sps->crop_left  =
        sps->crop_right =
        sps->crop_top   =
        sps->crop_bottom= 0;

    sps->vui_parameters_present_flag= get_bits1(&s->gb);
    if( sps->vui_parameters_present_flag )
        decode_vui_parameters(h, sps);

    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
               sps_id, sps->profile_idc, sps->level_idc,
               sps->poc_type,
               sps->ref_frame_count,
               sps->mb_width, sps->mb_height,
               sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
               sps->direct_8x8_inference_flag ? "8B8" : "",
               sps->crop_left, sps->crop_right,
               sps->crop_top, sps->crop_bottom,
               sps->vui_parameters_present_flag ? "VUI" : "",
               ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
    return 0;
7157 static void
7158 build_qp_table(PPS *pps, int t, int index)
7160 int i;
7161 for(i = 0; i < 52; i++)
7162 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7165 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7166 MpegEncContext * const s = &h->s;
7167 unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7168 PPS *pps;
7170 pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7171 if(pps == NULL)
7172 return -1;
7174 tmp= get_ue_golomb(&s->gb);
7175 if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7176 av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7177 return -1;
7179 pps->sps_id= tmp;
7181 pps->cabac= get_bits1(&s->gb);
7182 pps->pic_order_present= get_bits1(&s->gb);
7183 pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7184 if(pps->slice_group_count > 1 ){
7185 pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7186 av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7187 switch(pps->mb_slice_group_map_type){
7188 case 0:
7189 #if 0
7190 | for( i = 0; i <= num_slice_groups_minus1; i++ ) | | |
7191 | run_length[ i ] |1 |ue(v) |
7192 #endif
7193 break;
7194 case 2:
7195 #if 0
7196 | for( i = 0; i < num_slice_groups_minus1; i++ ) | | |
7197 |{ | | |
7198 | top_left_mb[ i ] |1 |ue(v) |
7199 | bottom_right_mb[ i ] |1 |ue(v) |
7200 | } | | |
7201 #endif
7202 break;
7203 case 3:
7204 case 4:
7205 case 5:
7206 #if 0
7207 | slice_group_change_direction_flag |1 |u(1) |
7208 | slice_group_change_rate_minus1 |1 |ue(v) |
7209 #endif
7210 break;
7211 case 6:
7212 #if 0
7213 | slice_group_id_cnt_minus1 |1 |ue(v) |
7214 | for( i = 0; i <= slice_group_id_cnt_minus1; i++ | | |
7215 |) | | |
7216 | slice_group_id[ i ] |1 |u(v) |
7217 #endif
7218 break;
7221 pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7222 pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7223 if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7224 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7225 pps->ref_count[0]= pps->ref_count[1]= 1;
7226 return -1;
7229 pps->weighted_pred= get_bits1(&s->gb);
7230 pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7231 pps->init_qp= get_se_golomb(&s->gb) + 26;
7232 pps->init_qs= get_se_golomb(&s->gb) + 26;
7233 pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7234 pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7235 pps->constrained_intra_pred= get_bits1(&s->gb);
7236 pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7238 pps->transform_8x8_mode= 0;
7239 h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7240 memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7241 memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7243 if(get_bits_count(&s->gb) < bit_length){
7244 pps->transform_8x8_mode= get_bits1(&s->gb);
7245 decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7246 pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7247 } else {
7248 pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7251 build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7252 build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7253 if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7254 h->pps.chroma_qp_diff= 1;
7256 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7257 av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7258 pps_id, pps->sps_id,
7259 pps->cabac ? "CABAC" : "CAVLC",
7260 pps->slice_group_count,
7261 pps->ref_count[0], pps->ref_count[1],
7262 pps->weighted_pred ? "weighted" : "",
7263 pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7264 pps->deblocking_filter_parameters_present ? "LPAR" : "",
7265 pps->constrained_intra_pred ? "CONSTR" : "",
7266 pps->redundant_pic_cnt_present ? "REDU" : "",
7267 pps->transform_8x8_mode ? "8x8DCT" : ""
7271 return 0;
/**
 * Call decode_slice() for each context.
 *
 * With a single context the slice is decoded inline; otherwise the
 * buffered slice contexts are run through avctx->execute() and the
 * position/error state of the last context is copied back into the
 * master context.
 *
 * @param h h264 master context
 * @param context_count number of contexts to execute
 */
static void execute_decode_slices(H264Context *h, int context_count){
    MpegEncContext * const s = &h->s;
    AVCodecContext * const avctx= s->avctx;
    H264Context *hx;
    int i;

    if(context_count == 1) {
        decode_slice(avctx, h);
    } else {
        /* propagate error-resilience settings; context 0 is the master
         * and already carries them */
        for(i = 1; i < context_count; i++) {
            hx = h->thread_context[i];
            hx->s.error_recognition = avctx->error_recognition;
            hx->s.error_count = 0;

        avctx->execute(avctx, (void *)decode_slice,
                       (void **)h->thread_context, NULL, context_count);

        /* pull back stuff from slices to master context */
        hx = h->thread_context[context_count - 1];
        s->mb_x = hx->s.mb_x;
        s->mb_y = hx->s.mb_y;
        s->dropable = hx->s.dropable;
        s->picture_structure = hx->s.picture_structure;
        for(i = 1; i < context_count; i++)
            h->s.error_count += h->thread_context[i]->s.error_count;
7310 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7311 MpegEncContext * const s = &h->s;
7312 AVCodecContext * const avctx= s->avctx;
7313 int buf_index=0;
7314 H264Context *hx; ///< thread context
7315 int context_count = 0;
7317 h->max_contexts = avctx->thread_count;
7318 #if 0
7319 int i;
7320 for(i=0; i<50; i++){
7321 av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7323 #endif
7324 if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7325 h->current_slice = 0;
7326 if (!s->first_field)
7327 s->current_picture_ptr= NULL;
7330 for(;;){
7331 int consumed;
7332 int dst_length;
7333 int bit_length;
7334 const uint8_t *ptr;
7335 int i, nalsize = 0;
7336 int err;
7338 if(h->is_avc) {
7339 if(buf_index >= buf_size) break;
7340 nalsize = 0;
7341 for(i = 0; i < h->nal_length_size; i++)
7342 nalsize = (nalsize << 8) | buf[buf_index++];
7343 if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7344 if(nalsize == 1){
7345 buf_index++;
7346 continue;
7347 }else{
7348 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7349 break;
7352 } else {
7353 // start code prefix search
7354 for(; buf_index + 3 < buf_size; buf_index++){
7355 // This should always succeed in the first iteration.
7356 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7357 break;
7360 if(buf_index+3 >= buf_size) break;
7362 buf_index+=3;
7365 hx = h->thread_context[context_count];
7367 ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7368 if (ptr==NULL || dst_length < 0){
7369 return -1;
7371 while(ptr[dst_length - 1] == 0 && dst_length > 0)
7372 dst_length--;
7373 bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7375 if(s->avctx->debug&FF_DEBUG_STARTCODE){
7376 av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7379 if (h->is_avc && (nalsize != consumed)){
7380 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7381 consumed= nalsize;
7384 buf_index += consumed;
7386 if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
7387 ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
7388 continue;
7390 again:
7391 err = 0;
7392 switch(hx->nal_unit_type){
7393 case NAL_IDR_SLICE:
7394 if (h->nal_unit_type != NAL_IDR_SLICE) {
7395 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7396 return -1;
7398 idr(h); //FIXME ensure we don't loose some frames if there is reordering
7399 case NAL_SLICE:
7400 init_get_bits(&hx->s.gb, ptr, bit_length);
7401 hx->intra_gb_ptr=
7402 hx->inter_gb_ptr= &hx->s.gb;
7403 hx->s.data_partitioning = 0;
7405 if((err = decode_slice_header(hx, h)))
7406 break;
7408 s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7409 if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7410 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7411 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7412 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7413 && avctx->skip_frame < AVDISCARD_ALL)
7414 context_count++;
7415 break;
7416 case NAL_DPA:
7417 init_get_bits(&hx->s.gb, ptr, bit_length);
7418 hx->intra_gb_ptr=
7419 hx->inter_gb_ptr= NULL;
7420 hx->s.data_partitioning = 1;
7422 err = decode_slice_header(hx, h);
7423 break;
7424 case NAL_DPB:
7425 init_get_bits(&hx->intra_gb, ptr, bit_length);
7426 hx->intra_gb_ptr= &hx->intra_gb;
7427 break;
7428 case NAL_DPC:
7429 init_get_bits(&hx->inter_gb, ptr, bit_length);
7430 hx->inter_gb_ptr= &hx->inter_gb;
7432 if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7433 && s->context_initialized
7434 && s->hurry_up < 5
7435 && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7436 && (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
7437 && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7438 && avctx->skip_frame < AVDISCARD_ALL)
7439 context_count++;
7440 break;
7441 case NAL_SEI:
7442 init_get_bits(&s->gb, ptr, bit_length);
7443 decode_sei(h);
7444 break;
7445 case NAL_SPS:
7446 init_get_bits(&s->gb, ptr, bit_length);
7447 decode_seq_parameter_set(h);
7449 if(s->flags& CODEC_FLAG_LOW_DELAY)
7450 s->low_delay=1;
7452 if(avctx->has_b_frames < 2)
7453 avctx->has_b_frames= !s->low_delay;
7454 break;
7455 case NAL_PPS:
7456 init_get_bits(&s->gb, ptr, bit_length);
7458 decode_picture_parameter_set(h, bit_length);
7460 break;
7461 case NAL_AUD:
7462 case NAL_END_SEQUENCE:
7463 case NAL_END_STREAM:
7464 case NAL_FILLER_DATA:
7465 case NAL_SPS_EXT:
7466 case NAL_AUXILIARY_SLICE:
7467 break;
7468 default:
7469 av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7472 if(context_count == h->max_contexts) {
7473 execute_decode_slices(h, context_count);
7474 context_count = 0;
7477 if (err < 0)
7478 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7479 else if(err == 1) {
7480 /* Slice could not be decoded in parallel mode, copy down
7481 * NAL unit stuff to context 0 and restart. Note that
7482 * rbsp_buffer is not transferred, but since we no longer
7483 * run in parallel mode this should not be an issue. */
7484 h->nal_unit_type = hx->nal_unit_type;
7485 h->nal_ref_idc = hx->nal_ref_idc;
7486 hx = h;
7487 goto again;
7490 if(context_count)
7491 execute_decode_slices(h, context_count);
7492 return buf_index;
7496 * returns the number of bytes consumed for building the current frame
7498 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7499 if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7500 if(pos+10>buf_size) pos=buf_size; // oops ;)
7502 return pos;
7505 static int decode_frame(AVCodecContext *avctx,
7506 void *data, int *data_size,
7507 const uint8_t *buf, int buf_size)
7509 H264Context *h = avctx->priv_data;
7510 MpegEncContext *s = &h->s;
7511 AVFrame *pict = data;
7512 int buf_index;
7514 s->flags= avctx->flags;
7515 s->flags2= avctx->flags2;
7517 /* end of stream, output what is still in the buffers */
7518 if (buf_size == 0) {
7519 Picture *out;
7520 int i, out_idx;
7522 //FIXME factorize this with the output code below
7523 out = h->delayed_pic[0];
7524 out_idx = 0;
7525 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7526 if(h->delayed_pic[i]->poc < out->poc){
7527 out = h->delayed_pic[i];
7528 out_idx = i;
7531 for(i=out_idx; h->delayed_pic[i]; i++)
7532 h->delayed_pic[i] = h->delayed_pic[i+1];
7534 if(out){
7535 *data_size = sizeof(AVFrame);
7536 *pict= *(AVFrame*)out;
7539 return 0;
7542 if(h->is_avc && !h->got_avcC) {
7543 int i, cnt, nalsize;
7544 unsigned char *p = avctx->extradata;
7545 if(avctx->extradata_size < 7) {
7546 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7547 return -1;
7549 if(*p != 1) {
7550 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7551 return -1;
7553 /* sps and pps in the avcC always have length coded with 2 bytes,
7554 so put a fake nal_length_size = 2 while parsing them */
7555 h->nal_length_size = 2;
7556 // Decode sps from avcC
7557 cnt = *(p+5) & 0x1f; // Number of sps
7558 p += 6;
7559 for (i = 0; i < cnt; i++) {
7560 nalsize = AV_RB16(p) + 2;
7561 if(decode_nal_units(h, p, nalsize) < 0) {
7562 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7563 return -1;
7565 p += nalsize;
7567 // Decode pps from avcC
7568 cnt = *(p++); // Number of pps
7569 for (i = 0; i < cnt; i++) {
7570 nalsize = AV_RB16(p) + 2;
7571 if(decode_nal_units(h, p, nalsize) != nalsize) {
7572 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7573 return -1;
7575 p += nalsize;
7577 // Now store right nal length size, that will be use to parse all other nals
7578 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7579 // Do not reparse avcC
7580 h->got_avcC = 1;
7583 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7584 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7585 return -1;
7586 h->got_avcC = 1;
7589 buf_index=decode_nal_units(h, buf, buf_size);
7590 if(buf_index < 0)
7591 return -1;
7593 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7594 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7595 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7596 return -1;
7599 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7600 Picture *out = s->current_picture_ptr;
7601 Picture *cur = s->current_picture_ptr;
7602 int i, pics, cross_idr, out_of_order, out_idx;
7604 s->mb_y= 0;
7606 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7607 s->current_picture_ptr->pict_type= s->pict_type;
7609 if(!s->dropable) {
7610 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7611 h->prev_poc_msb= h->poc_msb;
7612 h->prev_poc_lsb= h->poc_lsb;
7614 h->prev_frame_num_offset= h->frame_num_offset;
7615 h->prev_frame_num= h->frame_num;
7618 * FIXME: Error handling code does not seem to support interlaced
7619 * when slices span multiple rows
7620 * The ff_er_add_slice calls don't work right for bottom
7621 * fields; they cause massive erroneous error concealing
7622 * Error marking covers both fields (top and bottom).
7623 * This causes a mismatched s->error_count
7624 * and a bad error table. Further, the error count goes to
7625 * INT_MAX when called for bottom field, because mb_y is
7626 * past end by one (callers fault) and resync_mb_y != 0
7627 * causes problems for the first MB line, too.
7629 if (!FIELD_PICTURE)
7630 ff_er_frame_end(s);
7632 MPV_frame_end(s);
7634 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7635 /* Wait for second field. */
7636 *data_size = 0;
7638 } else {
7639 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7640 /* Derive top_field_first from field pocs. */
7641 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7643 //FIXME do something with unavailable reference frames
7645 /* Sort B-frames into display order */
7647 if(h->sps.bitstream_restriction_flag
7648 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7649 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7650 s->low_delay = 0;
7653 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7654 && !h->sps.bitstream_restriction_flag){
7655 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7656 s->low_delay= 0;
7659 pics = 0;
7660 while(h->delayed_pic[pics]) pics++;
7662 assert(pics <= MAX_DELAYED_PIC_COUNT);
7664 h->delayed_pic[pics++] = cur;
7665 if(cur->reference == 0)
7666 cur->reference = DELAYED_PIC_REF;
7668 out = h->delayed_pic[0];
7669 out_idx = 0;
7670 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7671 if(h->delayed_pic[i]->poc < out->poc){
7672 out = h->delayed_pic[i];
7673 out_idx = i;
7675 cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7677 out_of_order = !cross_idr && out->poc < h->outputed_poc;
7679 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7681 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7682 || (s->low_delay &&
7683 ((!cross_idr && out->poc > h->outputed_poc + 2)
7684 || cur->pict_type == FF_B_TYPE)))
7686 s->low_delay = 0;
7687 s->avctx->has_b_frames++;
7690 if(out_of_order || pics > s->avctx->has_b_frames){
7691 out->reference &= ~DELAYED_PIC_REF;
7692 for(i=out_idx; h->delayed_pic[i]; i++)
7693 h->delayed_pic[i] = h->delayed_pic[i+1];
7695 if(!out_of_order && pics > s->avctx->has_b_frames){
7696 *data_size = sizeof(AVFrame);
7698 h->outputed_poc = out->poc;
7699 *pict= *(AVFrame*)out;
7700 }else{
7701 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7706 assert(pict->data[0] || !*data_size);
7707 ff_print_debug_info(s, pict);
7708 //printf("out %d\n", (int)pict->data[0]);
7709 #if 0 //?
7711 /* Return the Picture timestamp as the frame number */
7712 /* we subtract 1 because it is added on utils.c */
7713 avctx->frame_number = s->picture_number - 1;
7714 #endif
7715 return get_consumed_bytes(s, buf_index, buf_size);
7717 #if 0
/* Dead code (never compiled — enclosed in #if 0): fills h->mb_avail[] with
 * the availability of the current macroblock's neighbours. A neighbour is
 * "available" when it lies inside the picture AND belongs to the same slice
 * (its slice_table[] entry equals h->slice_num).
 *   [0] top-left, [1] top, [2] top-right — all unavailable on the first MB row;
 *   [3] left; [4]/[5] are hard-coded constants (see FIXMEs below).
 * NOTE(review): the embedded blob line numbers jump (7720->7722, 7729->7731,
 * 7733->7735), i.e. blank/brace-only lines were lost in extraction, so the
 * closing braces of the if/else and of the function are missing from this
 * listing. */
7718 static inline void fill_mb_avail(H264Context *h){
7719 MpegEncContext * const s = &h->s;
7720 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7722 if(s->mb_y){
7723 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7724 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7725 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7726 }else{
/* first MB row: no neighbours above */
7727 h->mb_avail[0]=
7728 h->mb_avail[1]=
7729 h->mb_avail[2]= 0;
7731 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7732 h->mb_avail[4]= 1; //FIXME move out
7733 h->mb_avail[5]= 0; //FIXME move out
7735 #endif
7737 #ifdef TEST
7738 #undef printf
7739 #undef random
/* Self-test size: COUNT codes are written/read; SIZE bounds the bit buffer. */
7740 #define COUNT 8000
7741 #define SIZE (COUNT*40)
/* Standalone self-test harness (compiled only when TEST is defined):
 * round-trips COUNT unsigned and signed Exp-Golomb codes through a
 * PutBitContext/GetBitContext pair, timing each call with START_TIMER/
 * STOP_TIMER. The DCT, quantizer and NAL-layer tests further down are
 * disabled with #if 0.
 * NOTE(review): the embedded blob line numbers jump in many places
 * (e.g. 7758->7760, 7774->7778), i.e. blank/brace-only lines were lost when
 * this file was extracted, so loop/function closing braces are missing from
 * this listing. */
7742 int main(void){
7743 int i;
7744 uint8_t temp[SIZE];
7745 PutBitContext pb;
7746 GetBitContext gb;
7747 // int int_temp[10000];
7748 DSPContext dsp;
7749 AVCodecContext avctx;
7751 dsputil_init(&dsp, &avctx);
/* --- unsigned Exp-Golomb: encode i = 0..COUNT-1 --- */
7753 init_put_bits(&pb, temp, SIZE);
7754 printf("testing unsigned exp golomb\n");
7755 for(i=0; i<COUNT; i++){
7756 START_TIMER
7757 set_ue_golomb(&pb, i);
7758 STOP_TIMER("set_ue_golomb");
7760 flush_put_bits(&pb);
/* decode the codes back; a mismatch is only printed, not fatal
 * (the return -1 is commented out) */
7762 init_get_bits(&gb, temp, 8*SIZE);
7763 for(i=0; i<COUNT; i++){
7764 int j, s;
7766 s= show_bits(&gb, 24);
7768 START_TIMER
7769 j= get_ue_golomb(&gb);
7770 if(j != i){
7771 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7772 // return -1;
7774 STOP_TIMER("get_ue_golomb");
/* --- signed Exp-Golomb: same round-trip, values centred on 0 --- */
7778 init_put_bits(&pb, temp, SIZE);
7779 printf("testing signed exp golomb\n");
7780 for(i=0; i<COUNT; i++){
7781 START_TIMER
7782 set_se_golomb(&pb, i - COUNT/2);
7783 STOP_TIMER("set_se_golomb");
7785 flush_put_bits(&pb);
7787 init_get_bits(&gb, temp, 8*SIZE);
7788 for(i=0; i<COUNT; i++){
7789 int j, s;
7791 s= show_bits(&gb, 24);
7793 START_TIMER
7794 j= get_se_golomb(&gb);
7795 if(j != i - COUNT/2){
7796 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7797 // return -1;
7799 STOP_TIMER("get_se_golomb");
/* --- everything below here is disabled --- */
7802 #if 0
7803 printf("testing 4x4 (I)DCT\n");
7805 DCTELEM block[16];
7806 uint8_t src[16], ref[16];
7807 uint64_t error= 0, max_error=0;
7809 for(i=0; i<COUNT; i++){
7810 int j;
7811 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7812 for(j=0; j<16; j++){
7813 ref[j]= random()%255;
7814 src[j]= random()%255;
7817 h264_diff_dct_c(block, src, ref, 4);
7819 //normalize
7820 for(j=0; j<16; j++){
7821 // printf("%d ", block[j]);
7822 block[j]= block[j]*4;
7823 if(j&1) block[j]= (block[j]*4 + 2)/5;
7824 if(j&4) block[j]= (block[j]*4 + 2)/5;
7826 // printf("\n");
7828 s->dsp.h264_idct_add(ref, block, 4);
7829 /* for(j=0; j<16; j++){
7830 printf("%d ", ref[j]);
7832 printf("\n");*/
/* accumulate squared error between source and DCT->IDCT round-trip */
7834 for(j=0; j<16; j++){
7835 int diff= FFABS(src[j] - ref[j]);
7837 error+= diff*diff;
7838 max_error= FFMAX(max_error, diff);
7841 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7842 printf("testing quantizer\n");
7843 for(qp=0; qp<52; qp++){
7844 for(i=0; i<16; i++)
7845 src1_block[i]= src2_block[i]= random()%255;
/* NAL escaping round-trip: random payloads with i forced zero bytes are
 * encoded with encode_nal() and must decode back byte-identically */
7848 printf("Testing NAL layer\n");
7850 uint8_t bitstream[COUNT];
7851 uint8_t nal[COUNT*2];
7852 H264Context h;
7853 memset(&h, 0, sizeof(H264Context));
7855 for(i=0; i<COUNT; i++){
7856 int zeros= i;
7857 int nal_length;
7858 int consumed;
7859 int out_length;
7860 uint8_t *out;
7861 int j;
7863 for(j=0; j<COUNT; j++){
7864 bitstream[j]= (random() % 255) + 1;
7867 for(j=0; j<zeros; j++){
7868 int pos= random() % COUNT;
7869 while(bitstream[pos] == 0){
7870 pos++;
7871 pos %= COUNT;
7873 bitstream[pos]=0;
7876 START_TIMER
7878 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7879 if(nal_length<0){
7880 printf("encoding failed\n");
7881 return -1;
7884 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7886 STOP_TIMER("NAL")
7888 if(out_length != COUNT){
7889 printf("incorrect length %d %d\n", out_length, COUNT);
7890 return -1;
7893 if(consumed != nal_length){
7894 printf("incorrect consumed length %d %d\n", nal_length, consumed);
7895 return -1;
7898 if(memcmp(bitstream, out, COUNT)){
7899 printf("mismatch\n");
7900 return -1;
7903 #endif
7905 printf("Testing RBSP\n");
7908 return 0;
7910 #endif /* TEST */
7913 static av_cold int decode_end(AVCodecContext *avctx)
7915 H264Context *h = avctx->priv_data;
7916 MpegEncContext *s = &h->s;
7917 int i;
7919 av_freep(&h->rbsp_buffer[0]);
7920 av_freep(&h->rbsp_buffer[1]);
7921 free_tables(h); //FIXME cleanup init stuff perhaps
7923 for(i = 0; i < MAX_SPS_COUNT; i++)
7924 av_freep(h->sps_buffers + i);
7926 for(i = 0; i < MAX_PPS_COUNT; i++)
7927 av_freep(h->pps_buffers + i);
7929 MPV_common_end(s);
7931 // memset(h, 0, sizeof(H264Context));
7933 return 0;
/* Public registration entry for the H.264 decoder. The leading fields use
 * positional initializers; from the values these are evidently the codec
 * name, media type, codec id, private-context size, and the
 * init/encode/close/decode callbacks followed by the capability flags
 * (NOTE(review): confirm the exact field order against struct AVCodec in
 * avcodec.h). NULL in the encode slot: this entry is decode-only.
 * CODEC_CAP_DELAY is required because frames are reordered and emitted
 * late from the delayed_pic[] buffer; flush_dpb empties it on seek.
 * NOTE(review): the closing "};" of this initializer (blob line 7949) was
 * lost in extraction. */
7937 AVCodec h264_decoder = {
7938 "h264",
7939 CODEC_TYPE_VIDEO,
7940 CODEC_ID_H264,
7941 sizeof(H264Context),
7942 decode_init,
7943 NULL,
7944 decode_end,
7945 decode_frame,
7946 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7947 .flush= flush_dpb,
7948 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7951 #include "svq3.c"