Merge branch 'mirror' into vdpau
[FFMpeg-mirror/ffmpeg-vdpau.git] / libavcodec / h264.c
blob856813ec47ea60087c0412cc775854496b337779
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 /**
23 * @file h264.c
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
28 #include "dsputil.h"
29 #include "avcodec.h"
30 #include "mpegvideo.h"
31 #include "h264.h"
32 #include "h264data.h"
33 #include "h264_parser.h"
34 #include "golomb.h"
35 #include "rectangle.h"
37 #include "cabac.h"
38 #ifdef ARCH_X86
39 #include "i386/h264_i386.h"
40 #endif
42 //#undef NDEBUG
43 #include <assert.h>
45 /**
46 * Value of Picture.reference when Picture is not a reference picture, but
47 * is held for delayed output.
49 #define DELAYED_PIC_REF 4
/* CAVLC coeff_token VLCs: four code tables plus their static storage.
 * NOTE(review): presumably one table per nC context class — confirm in init. */
51 static VLC coeff_token_vlc[4];
52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
/* coeff_token VLC for chroma DC blocks (separate, smaller table). */
55 static VLC chroma_dc_coeff_token_vlc;
56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
/* total_zeros VLCs (15 variants) and the chroma DC total_zeros VLCs. */
59 static VLC total_zeros_vlc[15];
60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
61 static const int total_zeros_vlc_tables_size = 512;
63 static VLC chroma_dc_total_zeros_vlc[3];
64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
/* run_before VLCs: six small tables plus one larger table (run7). */
67 static VLC run_vlc[6];
68 static VLC_TYPE run_vlc_tables[6][8][2];
69 static const int run_vlc_tables_size = 8;
71 static VLC run7_vlc;
72 static VLC_TYPE run7_vlc_table[96][2];
73 static const int run7_vlc_table_size = 96;
/* VDPAU hardware decode hooks, defined in the vdpau integration code. */
75 extern int ff_VDPAU_h264_set_reference_frames(H264Context *h);
76 extern int ff_VDPAU_h264_picture_complete(H264Context *h, const uint8_t *buf, int buf_size);
77 extern void ff_VDPAU_h264_set_reference_frames_count(H264Context *h);
/* Forward declarations for functions defined later in this file. */
79 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
80 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
81 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
82 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
83 static Picture * remove_long(H264Context *h, int i, int ref_mask);
85 static av_always_inline uint32_t pack16to32(int a, int b){
86 #ifdef WORDS_BIGENDIAN
87 return (b&0xFFFF) + (a<<16);
88 #else
89 return (a&0xFFFF) + (b<<16);
90 #endif
/* Quantizer decomposition lookup tables for the H.264 QP range 0..51:
 * rem6[q] == q % 6 and div6[q] == q / 6 (avoids runtime div/mod).
 * Fix: the closing "};" of both initializers was lost in the mangled
 * paste, leaving the declarations unterminated — restored here. */
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
/* Left-neighbour sub-block index tables. Row 0 is the default mapping;
 * rows 1-3 are the alternative mappings selected in fill_caches() when
 * MBAFF field/frame coding of the current and left MB pair differ.
 * Fix: the closing "};" was lost in the mangled paste — restored. */
static const int left_block_options[4][8]={
    {0,1,2,3,7,10,8,11},
    {2,2,3,3,8,11,8,11},
    {0,0,1,1,7,10,7,10},
    {0,2,0,2,7,10,7,10}
};
/* Candidate pixel-format lists advertised for VDPAU hardware decoding,
 * one list per H.264 profile, each terminated by PIX_FMT_NONE. */
108 static const enum PixelFormat pixfmt_vdpau_h264_baseline_420[] = {
109 PIX_FMT_VDPAU_H264_BASELINE,
110 PIX_FMT_NONE};
111 static const enum PixelFormat pixfmt_vdpau_h264_main_420[] = {
112 PIX_FMT_VDPAU_H264_MAIN,
113 PIX_FMT_NONE};
114 static const enum PixelFormat pixfmt_vdpau_h264_high_420[] = {
115 PIX_FMT_VDPAU_H264_HIGH,
116 PIX_FMT_NONE};
/**
 * Fills the per-macroblock neighbour caches (intra pred modes, non-zero
 * counts, cbp, motion vectors, reference indices, mvd and direct flags)
 * from the frame-wide tables, applying the MBAFF/interlacing neighbour
 * derivation rules.
 * @param mb_type     type of the current macroblock
 * @param for_deblock nonzero when called from the loop filter; only the
 *                    subset of caches the deblocker reads is filled
 * NOTE(review): several closing-brace lines of this function were lost
 * when the file was pasted; code lines are preserved untouched.
 */
118 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
119 MpegEncContext * const s = &h->s;
120 const int mb_xy= h->mb_xy;
121 int topleft_xy, top_xy, topright_xy, left_xy[2];
122 int topleft_type, top_type, topright_type, left_type[2];
123 int * left_block;
124 int topleft_partition= -1;
125 int i;
/* top neighbour: one mb row up; two rows when decoding a field picture */
127 top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
129 //FIXME deblocking could skip the intra and nnz parts.
/* deblock fast path: same slice above and no MBAFF -> caches still valid */
130 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
131 return;
133 /* Wow, what a mess, why didn't they simplify the interlacing & intra
134 * stuff, I can't imagine that these complex rules are worth it. */
135 topleft_xy = top_xy - 1;
137 topright_xy= top_xy + 1;
138 left_xy[1] = left_xy[0] = mb_xy-1;
139 left_block = left_block_options[0];
/* MBAFF: neighbour addresses depend on the field/frame coding of each
 * macroblock pair; remap top/left/topleft/topright accordingly. */
140 if(FRAME_MBAFF){
141 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
142 const int top_pair_xy = pair_xy - s->mb_stride;
143 const int topleft_pair_xy = top_pair_xy - 1;
144 const int topright_pair_xy = top_pair_xy + 1;
145 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
146 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
147 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
148 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
149 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
150 const int bottom = (s->mb_y & 1);
151 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
152 if (bottom
153 ? !curr_mb_frame_flag // bottom macroblock
154 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
156 top_xy -= s->mb_stride;
158 if (bottom
159 ? !curr_mb_frame_flag // bottom macroblock
160 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
162 topleft_xy -= s->mb_stride;
163 } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
164 topleft_xy += s->mb_stride;
165 // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
166 topleft_partition = 0;
168 if (bottom
169 ? !curr_mb_frame_flag // bottom macroblock
170 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
172 topright_xy -= s->mb_stride;
174 if (left_mb_frame_flag != curr_mb_frame_flag) {
175 left_xy[1] = left_xy[0] = pair_xy - 1;
176 if (curr_mb_frame_flag) {
177 if (bottom) {
178 left_block = left_block_options[1];
179 } else {
180 left_block= left_block_options[2];
182 } else {
183 left_xy[1] += s->mb_stride;
184 left_block = left_block_options[3];
/* publish resolved neighbour addresses for later use */
189 h->top_mb_xy = top_xy;
190 h->left_mb_xy[0] = left_xy[0];
191 h->left_mb_xy[1] = left_xy[1];
/* deblocking needs neighbour mb types regardless of slice membership */
192 if(for_deblock){
193 topleft_type = 0;
194 topright_type = 0;
195 top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0;
196 left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
197 left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
/* MBAFF deblock: restore ref_cache values that MC rescaled */
199 if(MB_MBAFF && !IS_INTRA(mb_type)){
200 int list;
201 for(list=0; list<h->list_count; list++){
202 //These values where changed for ease of performing MC, we need to change them back
203 //FIXME maybe we can make MC and loop filter use the same values or prevent
204 //the MC code from changing ref_cache and rather use a temporary array.
205 if(USES_LIST(mb_type,list)){
206 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
207 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
208 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
209 ref += h->b8_stride;
210 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
211 *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
215 }else{
/* normal decode: neighbours count only when in the same slice */
216 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
217 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
218 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
219 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
220 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
/* intra prediction: compute per-4x4-block sample availability bitmasks */
222 if(IS_INTRA(mb_type)){
223 int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
224 h->topleft_samples_available=
225 h->top_samples_available=
226 h->left_samples_available= 0xFFFF;
227 h->topright_samples_available= 0xEEEA;
229 if(!(top_type & type_mask)){
230 h->topleft_samples_available= 0xB3FF;
231 h->top_samples_available= 0x33FF;
232 h->topright_samples_available= 0x26EA;
234 if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
235 if(IS_INTERLACED(mb_type)){
236 if(!(left_type[0] & type_mask)){
237 h->topleft_samples_available&= 0xDFFF;
238 h->left_samples_available&= 0x5FFF;
240 if(!(left_type[1] & type_mask)){
241 h->topleft_samples_available&= 0xFF5F;
242 h->left_samples_available&= 0xFF5F;
244 }else{
245 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
246 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
247 assert(left_xy[0] == left_xy[1]);
248 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
249 h->topleft_samples_available&= 0xDF5F;
250 h->left_samples_available&= 0x5F5F;
253 }else{
254 if(!(left_type[0] & type_mask)){
255 h->topleft_samples_available&= 0xDF5F;
256 h->left_samples_available&= 0x5F5F;
260 if(!(topleft_type & type_mask))
261 h->topleft_samples_available&= 0x7FFF;
263 if(!(topright_type & type_mask))
264 h->topright_samples_available&= 0xFBFF;
/* intra4x4: seed the prediction-mode cache from top and left neighbours */
266 if(IS_INTRA4x4(mb_type)){
267 if(IS_INTRA4x4(top_type)){
268 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
269 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
270 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
271 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
272 }else{
273 int pred;
274 if(!(top_type & type_mask))
275 pred= -1;
276 else{
277 pred= 2;
279 h->intra4x4_pred_mode_cache[4+8*0]=
280 h->intra4x4_pred_mode_cache[5+8*0]=
281 h->intra4x4_pred_mode_cache[6+8*0]=
282 h->intra4x4_pred_mode_cache[7+8*0]= pred;
284 for(i=0; i<2; i++){
285 if(IS_INTRA4x4(left_type[i])){
286 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
287 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
288 }else{
289 int pred;
290 if(!(left_type[i] & type_mask))
291 pred= -1;
292 else{
293 pred= 2;
295 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
296 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
/* diagram of the nnz/pred cache layout (T = from top, L = from left) */
305 0 . T T. T T T T
306 1 L . .L . . . .
307 2 L . .L . . . .
308 3 . T TL . . . .
309 4 L . .L . . . .
310 5 L . .. . . . .
312 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
/* non_zero_count cache: top row then the two left columns */
313 if(top_type){
314 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
315 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
316 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
317 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
319 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
320 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
322 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
323 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
325 }else{
326 h->non_zero_count_cache[4+8*0]=
327 h->non_zero_count_cache[5+8*0]=
328 h->non_zero_count_cache[6+8*0]=
329 h->non_zero_count_cache[7+8*0]=
331 h->non_zero_count_cache[1+8*0]=
332 h->non_zero_count_cache[2+8*0]=
334 h->non_zero_count_cache[1+8*3]=
335 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
339 for (i=0; i<2; i++) {
340 if(left_type[i]){
341 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
342 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
343 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
344 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
345 }else{
346 h->non_zero_count_cache[3+8*1 + 2*8*i]=
347 h->non_zero_count_cache[3+8*2 + 2*8*i]=
348 h->non_zero_count_cache[0+8*1 + 8*i]=
349 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
/* CABAC context: coded-block-pattern bits of the top/left neighbours */
353 if( h->pps.cabac ) {
354 // top_cbp
355 if(top_type) {
356 h->top_cbp = h->cbp_table[top_xy];
357 } else if(IS_INTRA(mb_type)) {
358 h->top_cbp = 0x1C0;
359 } else {
360 h->top_cbp = 0;
362 // left_cbp
363 if (left_type[0]) {
364 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
365 } else if(IS_INTRA(mb_type)) {
366 h->left_cbp = 0x1C0;
367 } else {
368 h->left_cbp = 0;
370 if (left_type[0]) {
371 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
373 if (left_type[1]) {
374 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
378 #if 1
/* inter: fill motion vector and reference index caches per list */
379 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
380 int list;
381 for(list=0; list<h->list_count; list++){
382 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
383 /*if(!h->mv_cache_clean[list]){
384 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
385 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
386 h->mv_cache_clean[list]= 1;
388 continue;
390 h->mv_cache_clean[list]= 0;
392 if(USES_LIST(top_type, list)){
393 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
394 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
395 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
396 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
397 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
398 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
399 h->ref_cache[list][scan8[0] + 0 - 1*8]=
400 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
401 h->ref_cache[list][scan8[0] + 2 - 1*8]=
402 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
403 }else{
404 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
405 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
406 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
407 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
408 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
411 for(i=0; i<2; i++){
412 int cache_idx = scan8[0] - 1 + i*2*8;
413 if(USES_LIST(left_type[i], list)){
414 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
415 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
416 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
417 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
418 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
419 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
420 }else{
421 *(uint32_t*)h->mv_cache [list][cache_idx ]=
422 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
423 h->ref_cache[list][cache_idx ]=
424 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
/* topleft/topright are only needed for MV prediction, not deblocking */
428 if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
429 continue;
431 if(USES_LIST(topleft_type, list)){
432 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
433 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
434 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
435 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
436 }else{
437 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
438 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
441 if(USES_LIST(topright_type, list)){
442 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
443 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
444 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
445 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
446 }else{
447 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
448 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
451 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
452 continue;
/* mark the padding cache cells unavailable / zero */
454 h->ref_cache[list][scan8[5 ]+1] =
455 h->ref_cache[list][scan8[7 ]+1] =
456 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
457 h->ref_cache[list][scan8[4 ]] =
458 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
459 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
460 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
461 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
462 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
463 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
/* CABAC: also fill the motion-vector-difference cache */
465 if( h->pps.cabac ) {
466 /* XXX beurk, Load mvd */
467 if(USES_LIST(top_type, list)){
468 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
469 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
470 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
471 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
472 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
473 }else{
474 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
475 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
476 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
477 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
479 if(USES_LIST(left_type[0], list)){
480 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
481 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
482 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
483 }else{
484 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
485 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
487 if(USES_LIST(left_type[1], list)){
488 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
489 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
490 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
491 }else{
492 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
493 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
495 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
496 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
497 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
498 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
499 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
/* B slices: fill the direct-mode flag cache from the neighbours */
501 if(h->slice_type_nos == FF_B_TYPE){
502 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
504 if(IS_DIRECT(top_type)){
505 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
506 }else if(IS_8X8(top_type)){
507 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
508 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
509 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
510 }else{
511 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
514 if(IS_DIRECT(left_type[0]))
515 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
516 else if(IS_8X8(left_type[0]))
517 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
518 else
519 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
521 if(IS_DIRECT(left_type[1]))
522 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
523 else if(IS_8X8(left_type[1]))
524 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
525 else
526 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
/* MBAFF: rescale cached refs/MVs when a neighbour's field/frame coding
 * differs from the current macroblock's */
530 if(FRAME_MBAFF){
531 #define MAP_MVS\
532 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
533 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
534 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
535 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
536 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
537 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
538 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
539 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
540 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
541 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
542 if(MB_FIELD){
543 #define MAP_F2F(idx, mb_type)\
544 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
545 h->ref_cache[list][idx] <<= 1;\
546 h->mv_cache[list][idx][1] /= 2;\
547 h->mvd_cache[list][idx][1] /= 2;\
549 MAP_MVS
550 #undef MAP_F2F
551 }else{
552 #define MAP_F2F(idx, mb_type)\
553 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
554 h->ref_cache[list][idx] >>= 1;\
555 h->mv_cache[list][idx][1] <<= 1;\
556 h->mvd_cache[list][idx][1] <<= 1;\
558 MAP_MVS
559 #undef MAP_F2F
564 #endif
566 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
569 static inline void write_back_intra_pred_mode(H264Context *h){
570 const int mb_xy= h->mb_xy;
572 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
573 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
574 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
575 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
576 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
577 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
578 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
/* Validates the cached intra4x4 prediction modes against neighbour sample
 * availability, replacing DC modes with the appropriate edge variant.
 * Returns 0 on success, -1 when a mode requires an unavailable neighbour. */
582 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
584 static inline int check_intra4x4_pred_mode(H264Context *h){
585 MpegEncContext * const s = &h->s;
/* mode -> replacement when top/left is missing; -1 flags an illegal mode */
586 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
587 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
588 int i;
590 if(!(h->top_samples_available&0x8000)){
591 for(i=0; i<4; i++){
592 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
593 if(status<0){
594 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
595 return -1;
596 } else if(status){
597 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
/* check each of the four left-edge blocks against its availability bit */
602 if((h->left_samples_available&0x8888)!=0x8888){
603 static const int mask[4]={0x8000,0x2000,0x80,0x20};
604 for(i=0; i<4; i++){
605 if(!(h->left_samples_available&mask[i])){
606 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
607 if(status<0){
608 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
609 return -1;
610 } else if(status){
611 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
617 return 0;
618 } //FIXME cleanup like next
/* Validates a 16x16-luma / chroma intra prediction mode against neighbour
 * availability; returns the (possibly remapped) mode or -1 on error. */
621 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
623 static inline int check_intra_pred_mode(H264Context *h, int mode){
624 MpegEncContext * const s = &h->s;
/* mode -> replacement when the top / left neighbour is missing */
625 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
626 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
/* unsigned compare also rejects negative modes */
628 if(mode > 6U) {
629 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
630 return -1;
633 if(!(h->top_samples_available&0x8000)){
634 mode= top[ mode ];
635 if(mode<0){
636 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
637 return -1;
641 if((h->left_samples_available&0x8080) != 0x8080){
642 mode= left[ mode ];
643 if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
644 mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
646 if(mode<0){
647 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
648 return -1;
652 return mode;
656 * gets the predicted intra4x4 prediction mode.
658 static inline int pred_intra_mode(H264Context *h, int n){
659 const int index8= scan8[n];
660 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
661 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
662 const int min= FFMIN(left, top);
664 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
666 if(min<0) return DC_PRED;
667 else return min;
670 static inline void write_back_non_zero_count(H264Context *h){
671 const int mb_xy= h->mb_xy;
673 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
674 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
675 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
676 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
677 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
678 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
679 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
681 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
682 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
683 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
685 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
686 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
687 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
691 * gets the predicted number of non-zero coefficients.
692 * @param n block index
694 static inline int pred_non_zero_count(H264Context *h, int n){
695 const int index8= scan8[n];
696 const int left= h->non_zero_count_cache[index8 - 1];
697 const int top = h->non_zero_count_cache[index8 - 8];
698 int i= left + top;
700 if(i<64) i= (i+1)>>1;
702 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
704 return i&31;
/* Selects the diagonal neighbour MV for motion vector prediction: points
 * *C at the top-right candidate and returns its reference index, falling
 * back to the top-left when top-right is unavailable. Under MBAFF the MV
 * must be fetched from the frame tables and rescaled, hence the macro. */
707 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
708 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
709 MpegEncContext *s = &h->s;
711 /* there is no consistent mapping of mvs to neighboring locations that will
712 * make mbaff happy, so we can't move all this logic to fill_caches */
713 if(FRAME_MBAFF){
714 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
715 const int16_t *mv;
/* default: point *C at a zeroed scratch cell of the MV cache */
716 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
717 *C = h->mv_cache[list][scan8[0]-2];
719 if(!MB_FIELD
720 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
721 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
722 if(IS_INTERLACED(mb_types[topright_xy])){
/* NOTE: the macro can return from the enclosing function; MV_OP/REF_OP
 * rescale the vertical MV and reference index between field and frame */
723 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
724 const int x4 = X4, y4 = Y4;\
725 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
726 if(!USES_LIST(mb_type,list))\
727 return LIST_NOT_USED;\
728 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
729 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
730 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
731 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
733 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
736 if(topright_ref == PART_NOT_AVAILABLE
737 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
738 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
739 if(!MB_FIELD
740 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
741 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
743 if(MB_FIELD
744 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
745 && i >= scan8[0]+8){
746 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
747 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
750 #undef SET_DIAG_MV
/* non-MBAFF path: take top-right from the cache, else fall back to top-left */
753 if(topright_ref != PART_NOT_AVAILABLE){
754 *C= h->mv_cache[list][ i - 8 + part_width ];
755 return topright_ref;
756 }else{
757 tprintf(s->avctx, "topright MV not available\n");
759 *C= h->mv_cache[list][ i - 8 - 1 ];
760 return h->ref_cache[list][ i - 8 - 1 ];
765 * gets the predicted MV.
766 * @param n the block index
767 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
768 * @param mx the x component of the predicted motion vector
769 * @param my the y component of the predicted motion vector
/* Median MV prediction from left (A), top (B) and diagonal (C) neighbours,
 * with the single-match and all-unavailable special cases of the spec. */
771 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
772 const int index8= scan8[n];
773 const int top_ref= h->ref_cache[list][ index8 - 8 ];
774 const int left_ref= h->ref_cache[list][ index8 - 1 ];
775 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
776 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
777 const int16_t * C;
778 int diagonal_ref, match_count;
780 assert(part_width==1 || part_width==2 || part_width==4);
782 /* mv_cache
783 B . . A T T T T
784 U . . L . . , .
785 U . . L . . . .
786 U . . L . . , .
787 . . . L . . . .
790 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
/* count how many neighbours use the same reference picture */
791 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
792 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
793 if(match_count > 1){ //most common
794 *mx= mid_pred(A[0], B[0], C[0]);
795 *my= mid_pred(A[1], B[1], C[1]);
796 }else if(match_count==1){
/* exactly one neighbour matches: take its MV directly */
797 if(left_ref==ref){
798 *mx= A[0];
799 *my= A[1];
800 }else if(top_ref==ref){
801 *mx= B[0];
802 *my= B[1];
803 }else{
804 *mx= C[0];
805 *my= C[1];
807 }else{
/* no match: only-left-available shortcut, otherwise plain median */
808 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
809 *mx= A[0];
810 *my= A[1];
811 }else{
812 *mx= mid_pred(A[0], B[0], C[0]);
813 *my= mid_pred(A[1], B[1], C[1]);
817 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
821 * gets the directionally predicted 16x8 MV.
822 * @param n the block index
823 * @param mx the x component of the predicted motion vector
824 * @param my the y component of the predicted motion vector
/* 16x8 partitions: top half prefers the top neighbour's MV, bottom half
 * the left neighbour's, falling back to the generic median predictor. */
826 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
827 if(n==0){
828 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
829 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
831 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
833 if(top_ref == ref){
834 *mx= B[0];
835 *my= B[1];
836 return;
838 }else{
839 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
840 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
842 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
844 if(left_ref == ref){
845 *mx= A[0];
846 *my= A[1];
847 return;
851 //RARE
852 pred_motion(h, n, 4, list, ref, mx, my);
/**
 * gets the directionally predicted 8x16 MV.
 * @param n the block index (0 = left 8x16 partition, else right)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        /* left partition: take the MV of the block to its left (A) if it uses the same reference */
        const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
        const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }else{
        /* right partition: take the top-right diagonal neighbor (C) if it uses the same reference */
        const int16_t * C;
        int diagonal_ref;

        diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);

        if(diagonal_ref == ref){
            *mx= C[0];
            *my= C[1];
            return;
        }
    }

    //RARE: directional candidate mismatched, fall back to the generic median prediction
    pred_motion(h, n, 2, list, ref, mx, my);
}
/**
 * Predicts the motion vector for a P-Skip macroblock.
 * The MV is zero when the top or left neighbor is unavailable, or when
 * either neighbor uses reference 0 with a zero MV; otherwise the normal
 * 16x16 median prediction with reference 0 is used.
 */
static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];

    tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
       || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
       || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){

        *mx = *my = 0;
        return;
    }

    pred_motion(h, 0, 4, 0, 0, mx, my);

    return;
}
911 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
912 int poc0 = h->ref_list[0][i].poc;
913 int td = av_clip(poc1 - poc0, -128, 127);
914 if(td == 0 || h->ref_list[0][i].long_ref){
915 return 256;
916 }else{
917 int tb = av_clip(poc - poc0, -128, 127);
918 int tx = (16384 + (FFABS(td) >> 1)) / td;
919 return av_clip((tb*tx + 32) >> 6, -1024, 1023);
/**
 * Fills the frame and per-field distance scale factor tables used by
 * temporal direct mode.
 */
static inline void direct_dist_scale_factor(H264Context * const h){
    MpegEncContext * const s = &h->s;
    const int poc  = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
    const int poc1 = h->ref_list[1][0].poc;
    int i, field;

    /* per-field factors; note the loop deliberately shadows poc/poc1 with
     * the field POCs of the current picture and of ref_list[1][0] */
    for(field=0; field<2; field++){
        const int poc  = h->s.current_picture_ptr->field_poc[field];
        const int poc1 = h->ref_list[1][0].field_poc[field];
        for(i=0; i < 2*h->ref_count[0]; i++)
            h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
    }

    /* frame-level factors, using the outer (current structure) POCs */
    for(i=0; i<h->ref_count[0]; i++){
        h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
    }
}
/**
 * Builds the map from the colocated picture's (ref_list[1][0]) reference
 * indices to the current list's indices, by matching the packed
 * 4*frame_num + reference values; used by temporal direct mode.
 * @param colfield which parity of the colocated picture's reference lists to read
 * @param mbafi nonzero when building the MBAFF field map (entries 16..16+2*ref_count)
 */
static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    int j, old_ref, rfield;
    int start= mbafi ? 16                      : 0;
    int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
    int interl= mbafi || s->picture_structure != PICT_FRAME;

    /* bogus; fills in for missing frames */
    memset(map[list], 0, sizeof(map[list]));

    for(rfield=0; rfield<2; rfield++){
        for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
            int poc = ref1->ref_poc[colfield][list][old_ref];

            if     (!interl)
                poc |= 3;      // progressive: mark as "both fields" so the packed values compare equal
            else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
                poc= (poc&~3) + rfield + 1;

            /* search the current list for the reference with the same packed poc */
            for(j=start; j<end; j++){
                if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
                    int cur_ref= mbafi ? (j-16)^field : j;
                    map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
                    if(rfield == field)
                        map[list][old_ref] = cur_ref;
                    break;
                }
            }
        }
    }
}
/**
 * Initializes per-picture data needed for B-frame direct prediction:
 * stores the packed reference POCs/counts on the current picture and,
 * for temporal direct mode, builds the colocated-to-list0 index maps.
 */
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, j, field;
    int sidx= (s->picture_structure&1)^1;     // structure index of the current picture
    int ref1sidx= (ref1->reference&1)^1;      // structure index of the colocated picture

    /* record this picture's reference lists as packed 4*frame_num+reference values */
    for(list=0; list<2; list++){
        cur->ref_count[sidx][list] = h->ref_count[list];
        for(j=0; j<h->ref_count[list]; j++)
            cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
    }

    /* a frame picture fills both field slots identically */
    if(s->picture_structure == PICT_FRAME){
        memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
        memcpy(cur->ref_poc [1], cur->ref_poc [0], sizeof(cur->ref_poc [0]));
    }

    cur->mbaff= FRAME_MBAFF;

    /* the colocated maps below are only needed for temporal direct B slices */
    if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
        return;

    for(list=0; list<2; list++){
        fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
        for(field=0; field<2; field++)
            fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
    }
}
/**
 * Derives the motion vectors and reference indices of a B_Direct
 * macroblock (or the direct 8x8 sub-blocks when *mb_type is 8x8),
 * filling mv_cache/ref_cache/sub_mb_type, using either spatial or
 * temporal direct prediction depending on h->direct_spatial_mv_pred.
 * Also rewrites *mb_type to the effective partition type.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    int b8_stride = h->b8_stride;
    int b4_stride = h->b_stride;
    int mb_xy = h->mb_xy;
    int mb_type_col[2];
    const int16_t (*l1mv0)[2], (*l1mv1)[2];
    const int8_t *l1ref0, *l1ref1;
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)

    /* locate the colocated macroblock in ref_list[1][0] and choose the
     * partitioning, handling all frame/field (MBAFF) combinations */
    if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
        if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
            /* current MB is a frame MB but the colocated is a field MB:
             * pick the colocated field closest in POC */
            int cur_poc = s->current_picture_ptr->poc;
            int *col_poc = h->ref_list[1]->field_poc;
            int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
            mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
            b8_stride = 0;
        }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
            int fieldoff= 2*(h->ref_list[1][0].reference)-3;
            mb_xy += s->mb_stride*fieldoff;
        }
        goto single_col;
    }else{                                               // AFL/AFR/FR/FL -> AFR/FR
        if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
            /* field MB over a frame colocated picture: read a 16x32 area (two MBs) */
            mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
            mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
            b8_stride *= 3;
            b4_stride *= 6;
            //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
            if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
                && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
                && !is_b8x8){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }
        }else{                                           //     AFR/FR    -> AFR/FR
single_col:
            mb_type_col[0] =
            mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
            if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
                /* FIXME save sub mb types from previous frames (or derive from MVs)
                 * so we know exactly what block size to use */
                sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
            }else{
                sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
                *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
            }
        }
    }

    /* pointers into the colocated picture's MVs and reference indices */
    l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
    l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
    if(!b8_stride){
        /* frame MB over field colocated: select the right half of the colocated MB pair */
        if(s->mb_y&1){
            l1ref0 += h->b8_stride;
            l1ref1 += h->b8_stride;
            l1mv0  +=  2*b4_stride;
            l1mv1  +=  2*b4_stride;
        }
    }

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors); the unsigned casts make negative (unavailable)
         * references compare larger than any valid index */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == PART_NOT_AVAILABLE)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no neighbor references at all: predict ref 0 with zero MVs */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the unused prediction list from the (sub) macroblock type */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;
        }

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* frame/field mismatch with the colocated MB: per-8x8 colocated lookup */
            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* col_zero_flag: colocated block is (near) static -> zero out
                 * the MV of each list whose predicted reference is > 0 */
                if(!IS_INTRA(mb_type_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                }
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
            }
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            if(!IS_INTRA(mb_type_col[0])
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){  // workaround for an old x264 bug
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
                                                 || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
                                                     && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;
        int ref_offset= 0;

        /* MBAFF field MBs use the per-field maps/scale factors */
        if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
            map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
            map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
            dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
        }
        if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
            ref_offset += 16;   // index into the MBAFF half of the colocated map

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
            /* FIXME assumes direct_8x8_inference == 1 */
            int y_shift  = 2*!IS_INTERLACED(*mb_type);  // vertical MV scaling between frame and field

            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[y8])){
                    /* intra colocated block: ref 0 with zero MVs */
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0 + ref_offset];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                {
                    const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
                    int my_col = (mv_col[1]<<y_shift)/2;
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * my_col + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                }
            }
            return;
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col[0])){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
                                                : map_col_to_list0[1][l1ref1[0] + ref_offset];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                /* L1 MV = L0 MV - colocated MV (temporal direct) */
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col[0])){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
/**
 * Writes the per-macroblock motion data back from the decode caches
 * (mv_cache/ref_cache/mvd_cache) into the picture-wide tables
 * (motion_val, ref_index, mvd_table, direct_table).
 */
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;    // 4x4-block coordinates
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;   // 8x8-block coordinates
    int list;

    if(!USES_LIST(mb_type, 0))
        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);

    for(list=0; list<h->list_count; list++){
        int y;
        if(!USES_LIST(mb_type, list))
            continue;

        /* copy the 4x4 MVs, two packed MVs (one uint64_t) per row half */
        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            /* CABAC also needs the MV differences for context modelling */
            if(IS_SKIP(mb_type))
                fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
            else
                for(y=0; y<4; y++){
                    *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                    *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
                }
        }

        {
            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
        }
    }

    if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            uint8_t *direct_table = &h->direct_table[b8_xy];
            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}
/**
 * Decodes a network abstraction layer unit.
 * Parses the NAL header and removes the 00 00 03 emulation-prevention
 * escapes from the payload.
 * @param consumed is the number of bytes used as input
 * @param length is the length of the array
 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp trailing?
 * @returns decoded bytes, might be src+1 if no escapes
 */
static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    int i, si, di;
    uint8_t *dst;
    int bufidx;

//    src[0]&0x80;              //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    src++; length--;
#if 0
    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
#endif
    /* fast scan for the first 00 00 (0x|1|2|3) sequence; zeros can only
     * start at even or odd positions so stepping by 2 is safe here */
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--;
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            if(src[i+2]!=3){
                /* startcode, so we must be past the end */
                length=i;
            }
            break;
        }
    }

    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header
        return src;
    }

    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    /* NOTE(review): the buffer is sized exactly `length`, with no read
     * padding for the bit reader — confirm downstream readers never
     * over-read past dst_length */
    h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
    dst= h->rbsp_buffer[bufidx];

    if (dst == NULL){
        return NULL;
    }

//printf("decoding esc\n");
    si=di=0;
    while(si<length){
        //remove escapes (very rare 1:2^22)
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
            if(src[si+2]==3){ //escape
                dst[di++]= 0;
                dst[di++]= 0;
                si+=3;
                continue;
            }else //next start code
                break;
        }

        dst[di++]= src[si++];
    }

    *dst_length= di;
    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
    return dst;
}
1454 * identifies the exact end of the bitstream
1455 * @return the length of the trailing, or 0 if damaged
1457 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1458 int v= *src;
1459 int r;
1461 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1463 for(r=1; r<9; r++){
1464 if(v&1) return r;
1465 v>>=1;
1467 return 0;
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * The DC coefficients sit at positions {0,4}x{0,4} of each 4x4 block in
 * the 16x16 coefficient array (hence the x_offset/y_offset tables).
 * @param qp quantization parameter
 */
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
#define stride 16
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

//memset(block, 64, 2*256);
//return;
    /* horizontal pass (butterflies over the 4 DC columns of each row) */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* vertical pass + dequantization */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    }
}
#if 0
/**
 * DCT transforms the 16 dc values.
 * NOTE: dead code, disabled with #if 0 (encoder-side forward transform).
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1550 #undef xStride
1551 #undef stride
1553 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1554 const int stride= 16*2;
1555 const int xStride= 16;
1556 int a,b,c,d,e;
1558 a= block[stride*0 + xStride*0];
1559 b= block[stride*0 + xStride*1];
1560 c= block[stride*1 + xStride*0];
1561 d= block[stride*1 + xStride*1];
1563 e= a-b;
1564 a= a+b;
1565 b= c-d;
1566 c= c+d;
1568 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1569 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1570 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1571 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
#if 0
/* NOTE: dead code, disabled with #if 0 (encoder-side forward 2x2 chroma DC transform). */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
/**
 * gets the chroma qp.
 * @param t index into the per-component chroma QP table (presumably 0=Cb, 1=Cr — confirm against callers)
 * @param qscale the luma QP to map
 */
static inline int get_chroma_qp(H264Context *h, int t, int qscale){
    return h->pps.chroma_qp_table[t][qscale];
}
//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes a 4x4 coefficient block in place (encoder path).
 * @param scantable zigzag scan order used to track the last nonzero coefficient
 * @param intra selects the larger (1/3) rounding bias vs. inter (1/6)
 * @param separate_dc if set, the DC coefficient is quantized with its own
 *        bias/shift (two variants depending on qscale to avoid overflow)
 * @return index (in scan order) of the last nonzero coefficient, or -1/0
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            /* single unsigned compare tests both |level| > threshold cases */
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        last_non_zero= 0;
        i=1;            // DC already handled, AC loop starts at 1
    }else{
        last_non_zero= -1;
        i=0;
    }

    /* quantize the (remaining) coefficients in scan order */
    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
/**
 * Performs luma + chroma motion compensation for one partition in one
 * direction (one reference list), including edge emulation when the MV
 * points (partly) outside the reference picture.
 * @param square if zero, a second qpix_op call at dest+delta covers the
 *        non-square half of the partition
 * @param delta offset (in bytes) of the second luma half for non-square parts
 * @param list the prediction list (0 or 1) whose cached MV is used
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;  // quarter-pel position
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);      // selects the qpel interpolation function
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
        return;

    /* fractional positions need 3 extra filter-tap pixels on each side */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
/**
 * Standard (non-weighted) motion compensation of one partition:
 * list0 prediction uses the "put" functions; if list1 is also used, its
 * prediction is blended in with the "avg" functions (simple bi-average).
 * @param x_offset,y_offset position of the partition inside the MB, in 8-pel units
 */
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;
    qpel_mc_func *qpix_op=  qpix_put;
    h264_chroma_mc_func chroma_op= chroma_put;

    /* advance destinations to the partition, then convert the offsets to
     * picture coordinates for the source fetch */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);

        /* the second list averages onto the first prediction */
        qpix_op=  qpix_avg;
        chroma_op= chroma_avg;
    }

    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op);
    }
}
/**
 * Weighted motion compensation for one partition.
 * Handles both explicit weighting (use_weight==1) and implicit bipred
 * weighting (use_weight==2); unidirectional partitions are predicted in
 * place and then weighted, bidirectional ones are predicted into the
 * scratchpad and blended with the biweight operators.
 */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    /* position the destination pointers on the sub-partition and express the
     * offsets in frame coordinates (chroma units) */
    dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
    dest_cb += x_offset + y_offset*h->mb_uvlinesize;
    dest_cr += x_offset + y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        /* list0 prediction goes directly to dest, list1 to the scratchpad */
        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* implicit weighting: the two weights sum to 64, offset is 0,
             * log2 denom is fixed at 5 */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            /* explicit weighting: per-list weights/offsets from the slice header */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        }
    }else{
        /* single-list prediction: predict in place, then weight in place */
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
        }
    }
}
1845 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1846 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1847 int x_offset, int y_offset,
1848 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1849 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1850 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1851 int list0, int list1){
1852 if((h->use_weight==2 && list0 && list1
1853 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1854 || h->use_weight==1)
1855 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1856 x_offset, y_offset, qpix_put, chroma_put,
1857 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1858 else
1859 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1860 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1863 static inline void prefetch_motion(H264Context *h, int list){
1864 /* fetch pixels for estimated mv 4 macroblocks ahead
1865 * optimized for 64byte cache lines */
1866 MpegEncContext * const s = &h->s;
1867 const int refn = h->ref_cache[list][scan8[0]];
1868 if(refn >= 0){
1869 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1870 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1871 uint8_t **src= h->ref_list[list][refn].data;
1872 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1873 s->dsp.prefetch(src[0]+off, s->linesize, 4);
1874 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1875 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
/**
 * Perform all inter prediction for one macroblock, dispatching per
 * partition shape (16x16, 16x8, 8x16 or 8x8 with sub-partitions) to
 * mc_part() with the matching qpel/chroma/weight operator sizes.
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one 16x16 partition */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two 16x8 partitions, stacked vertically */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two 8x16 partitions, side by side */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 sub-macroblocks, each possibly subdivided further */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1966 static av_cold void decode_init_vlc(void){
1967 static int done = 0;
1969 if (!done) {
1970 int i;
1971 int offset;
1972 done = 1;
1974 chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1975 chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1976 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1977 &chroma_dc_coeff_token_len [0], 1, 1,
1978 &chroma_dc_coeff_token_bits[0], 1, 1,
1979 INIT_VLC_USE_NEW_STATIC);
1981 offset = 0;
1982 for(i=0; i<4; i++){
1983 coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1984 coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1985 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1986 &coeff_token_len [i][0], 1, 1,
1987 &coeff_token_bits[i][0], 1, 1,
1988 INIT_VLC_USE_NEW_STATIC);
1989 offset += coeff_token_vlc_tables_size[i];
1992 * This is a one time safety check to make sure that
1993 * the packed static coeff_token_vlc table sizes
1994 * were initialized correctly.
1996 assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1998 for(i=0; i<3; i++){
1999 chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
2000 chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
2001 init_vlc(&chroma_dc_total_zeros_vlc[i],
2002 CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2003 &chroma_dc_total_zeros_len [i][0], 1, 1,
2004 &chroma_dc_total_zeros_bits[i][0], 1, 1,
2005 INIT_VLC_USE_NEW_STATIC);
2007 for(i=0; i<15; i++){
2008 total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
2009 total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
2010 init_vlc(&total_zeros_vlc[i],
2011 TOTAL_ZEROS_VLC_BITS, 16,
2012 &total_zeros_len [i][0], 1, 1,
2013 &total_zeros_bits[i][0], 1, 1,
2014 INIT_VLC_USE_NEW_STATIC);
2017 for(i=0; i<6; i++){
2018 run_vlc[i].table = run_vlc_tables[i];
2019 run_vlc[i].table_allocated = run_vlc_tables_size;
2020 init_vlc(&run_vlc[i],
2021 RUN_VLC_BITS, 7,
2022 &run_len [i][0], 1, 1,
2023 &run_bits[i][0], 1, 1,
2024 INIT_VLC_USE_NEW_STATIC);
2026 run7_vlc.table = run7_vlc_table,
2027 run7_vlc.table_allocated = run7_vlc_table_size;
2028 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2029 &run_len [6][0], 1, 1,
2030 &run_bits[6][0], 1, 1,
2031 INIT_VLC_USE_NEW_STATIC);
/**
 * Free all per-picture tables allocated by alloc_tables(), plus the
 * per-thread buffers allocated by context_init() (top_borders) and
 * frame_start() (obmc_scratchpad). Safe to call on partially allocated
 * state: av_freep() handles NULL and clears each pointer.
 */
static void free_tables(H264Context *h){
    int i;
    H264Context *hx;
    av_freep(&h->intra4x4_pred_mode);
    av_freep(&h->chroma_pred_mode_table);
    av_freep(&h->cbp_table);
    av_freep(&h->mvd_table[0]);
    av_freep(&h->mvd_table[1]);
    av_freep(&h->direct_table);
    av_freep(&h->non_zero_count);
    av_freep(&h->slice_table_base);
    h->slice_table= NULL;   // points into slice_table_base, now freed

    av_freep(&h->mb2b_xy);
    av_freep(&h->mb2b8_xy);

    /* per-thread buffers live in each thread context, not the master */
    for(i = 0; i < h->s.avctx->thread_count; i++) {
        hx = h->thread_context[i];
        if(!hx) continue;
        av_freep(&hx->top_borders[1]);
        av_freep(&hx->top_borders[0]);
        av_freep(&hx->s.obmc_scratchpad);
    }
}
/**
 * Build the 8x8 dequantization tables (one per intra/inter list, qp 0..51)
 * from the PPS scaling matrices. Identical scaling matrices share one buffer.
 */
static void init_dequant8_coeff_table(H264Context *h){
    int i,q,x;
    // whether the active IDCT implementation wants transposed coefficients
    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
    h->dequant8_coeff[0] = h->dequant8_buffer[0];
    h->dequant8_coeff[1] = h->dequant8_buffer[1];

    for(i=0; i<2; i++ ){
        // if both 8x8 scaling matrices are identical, alias list 1 to list 0
        if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
            h->dequant8_coeff[1] = h->dequant8_buffer[0];
            break;
        }

        for(q=0; q<52; q++){
            int shift = div6[q];    // qp/6 gives the scaling power of two
            int idx = rem6[q];      // qp%6 selects the base coefficient row
            for(x=0; x<64; x++)
                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
                    h->pps.scaling_matrix8[i][x]) << shift;
        }
    }
}
/**
 * Build the 4x4 dequantization tables (six lists, qp 0..51) from the PPS
 * scaling matrices. Lists with identical scaling matrices share one buffer.
 */
static void init_dequant4_coeff_table(H264Context *h){
    int i,j,q,x;
    // whether the active IDCT implementation wants transposed coefficients
    const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
    for(i=0; i<6; i++ ){
        h->dequant4_coeff[i] = h->dequant4_buffer[i];
        // reuse the buffer of any earlier list with an identical scaling matrix
        for(j=0; j<i; j++){
            if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
                h->dequant4_coeff[i] = h->dequant4_buffer[j];
                break;
            }
        }
        if(j<i)
            continue;   // aliased above, nothing to compute

        for(q=0; q<52; q++){
            int shift = div6[q] + 2;    // qp/6 plus the 4x4 normalization
            int idx = rem6[q];          // qp%6 selects the base coefficient row
            for(x=0; x<16; x++)
                h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
                    h->pps.scaling_matrix4[i][x]) << shift;
        }
    }
}
/**
 * Build all dequant tables: 4x4 always, 8x8 only when the PPS enables the
 * 8x8 transform. In transform-bypass mode, qp==0 entries are forced to the
 * unit scale (1<<6 in the fixed-point convention used by these tables) so
 * that bypassed blocks pass through unchanged.
 */
static void init_dequant_tables(H264Context *h){
    int i,x;
    init_dequant4_coeff_table(h);
    if(h->pps.transform_8x8_mode)
        init_dequant8_coeff_table(h);
    if(h->sps.transform_bypass){
        for(i=0; i<6; i++)
            for(x=0; x<16; x++)
                h->dequant4_coeff[i][0][x] = 1<<6;
        if(h->pps.transform_8x8_mode)
            for(i=0; i<2; i++)
                for(x=0; x<64; x++)
                    h->dequant8_coeff[i][0][x] = 1<<6;
    }
}
/**
 * allocates tables.
 * needs width/height
 *
 * On failure all partial allocations are released via free_tables()
 * (CHECKED_ALLOCZ jumps to the fail: label) and -1 is returned.
 */
static int alloc_tables(H264Context *h){
    MpegEncContext * const s = &h->s;
    /* one extra MB row as guard space for edge macroblocks */
    const int big_mb_num= s->mb_stride * (s->mb_height+1);
    int x,y;

    CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))

    CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
    CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))

    CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
    CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
    CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));

    /* -1 marks "no slice"; slice_table points past the guard rows/column */
    memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
    h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;

    /* precomputed MB index -> motion-vector / 8x8-block index maps */
    CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
    CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
    for(y=0; y<s->mb_height; y++){
        for(x=0; x<s->mb_width; x++){
            const int mb_xy= x + y*s->mb_stride;
            const int b_xy = 4*x + 4*y*h->b_stride;
            const int b8_xy= 2*x + 2*y*h->b8_stride;

            h->mb2b_xy [mb_xy]= b_xy;
            h->mb2b8_xy[mb_xy]= b8_xy;
        }
    }

    /* allocated lazily in frame_start() once linesize is known */
    s->obmc_scratchpad = NULL;

    if(!h->dequant4_coeff[0])
        init_dequant_tables(h);

    return 0;
fail:
    free_tables(h);
    return -1;
}
/**
 * Mimic alloc_tables(), but for every context thread.
 * The big per-picture tables are shared (aliased) with the master context;
 * only the obmc scratchpad stays per-thread (allocated lazily in
 * frame_start()) and the prediction function pointers are re-initialized.
 */
static void clone_tables(H264Context *dst, H264Context *src){
    dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
    dst->non_zero_count = src->non_zero_count;
    dst->slice_table = src->slice_table;
    dst->cbp_table = src->cbp_table;
    dst->mb2b_xy = src->mb2b_xy;
    dst->mb2b8_xy = src->mb2b8_xy;
    dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
    dst->mvd_table[0] = src->mvd_table[0];
    dst->mvd_table[1] = src->mvd_table[1];
    dst->direct_table = src->direct_table;

    dst->s.obmc_scratchpad = NULL;
    ff_h264_pred_init(&dst->hpc, src->s.codec_id);
}
/**
 * Init context
 * Allocate buffers which are not shared amongst multiple threads.
 */
static int context_init(H264Context *h){
    /* one row of saved border samples per MB: 16 luma + 8 cb + 8 cr bytes;
     * two buffers for the two field parities */
    CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
    CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))

    return 0;
fail:
    return -1; // free_tables will clean up for us
}
/**
 * Initialization shared by all entry points: mirrors the user-supplied
 * dimensions and codec id into the MpegEncContext, sets up the prediction
 * function pointers, and installs flat (all-16) scaling matrices until a
 * real SPS/PPS provides them.
 */
static av_cold void common_init(H264Context *h){
    MpegEncContext * const s = &h->s;

    s->width = s->avctx->width;
    s->height = s->avctx->height;
    s->codec_id= s->avctx->codec->id;

    ff_h264_pred_init(&h->hpc, s->codec_id);

    h->dequant_coeff_pps= -1;   // no PPS has provided dequant coefficients yet
    s->unrestricted_mv=1;
    s->decode=1; //FIXME

    memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
    memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
}
2221 static av_cold int decode_init(AVCodecContext *avctx){
2222 H264Context *h= avctx->priv_data;
2223 MpegEncContext * const s = &h->s;
2225 MPV_decode_defaults(s);
2227 s->avctx = avctx;
2228 common_init(h);
2230 s->out_format = FMT_H264;
2231 s->workaround_bugs= avctx->workaround_bugs;
2233 // set defaults
2234 // s->decode_mb= ff_h263_decode_mb;
2235 s->quarter_sample = 1;
2236 s->low_delay= 1;
2238 // Set in decode_postinit() once initial parsing is complete
2239 avctx->pix_fmt = PIX_FMT_NONE;
2241 decode_init_vlc();
2243 if(avctx->extradata_size > 0 && avctx->extradata &&
2244 *(char *)avctx->extradata == 1){
2245 h->is_avc = 1;
2246 h->got_avcC = 0;
2247 } else {
2248 h->is_avc = 0;
2251 h->thread_context[0] = h;
2252 h->outputed_poc = INT_MIN;
2253 h->prev_poc_msb= 1<<16;
2254 return 0;
2257 static int decode_postinit(H264Context *h, SPS *sps){
2258 AVCodecContext * const avctx= h->s.avctx;
2260 if (avctx->pix_fmt != PIX_FMT_NONE){
2261 return 0;
2264 if (avctx->vdpau_acceleration) {
2265 if(h->s.chroma_format >= 2) {
2266 return -2;
2268 if (sps->profile_idc == 66) {
2269 avctx->pix_fmt = avctx->get_format(avctx, pixfmt_vdpau_h264_baseline_420);
2270 } else if (sps->profile_idc == 77) {
2271 avctx->pix_fmt = avctx->get_format(avctx, pixfmt_vdpau_h264_main_420);
2272 } else if (sps->profile_idc == 100) {
2273 avctx->pix_fmt = avctx->get_format(avctx, pixfmt_vdpau_h264_high_420);
2274 } else {
2275 return -2;
2277 } else if (avctx->codec_id == CODEC_ID_SVQ3) {
2278 avctx->pix_fmt= PIX_FMT_YUVJ420P;
2279 } else {
2280 avctx->pix_fmt= PIX_FMT_YUV420P;
2283 return 0;
/**
 * Start decoding a new frame: allocate/recycle the picture via
 * MPV_frame_start(), precompute the per-block destination offsets for
 * frame and field (x2 linesize) rendering, and lazily allocate the
 * per-thread bipred scratchpad (which needs linesize, unknown at
 * alloc_tables() time). Returns 0 on success, -1 on failure.
 */
static int frame_start(H264Context *h){
    MpegEncContext * const s = &h->s;
    int i;

#ifdef HAVE_VDPAU
    ff_VDPAU_h264_set_reference_frames_count(h);
#endif

    if(MPV_frame_start(s, s->avctx) < 0)
        return -1;
    ff_er_frame_start(s);
    /*
     * MPV_frame_start uses pict_type to derive key_frame.
     * This is incorrect for H.264; IDR markings must be used.
     * Zero here; IDR markings per slice in frame or fields are ORed in later.
     * See decode_nal_units().
     */
    s->current_picture_ptr->key_frame= 0;

    assert(s->linesize && s->uvlinesize);

    /* block_offset[0..23]: frame rendering; [24..47]: field (doubled stride) */
    for(i=0; i<16; i++){
        h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
    }
    for(i=0; i<4; i++){
        h->block_offset[16+i]=
        h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+16+i]=
        h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    }

    /* can't be in alloc_tables because linesize isn't known there.
     * FIXME: redo bipred weight to not require extra buffer? */
    for(i = 0; i < s->avctx->thread_count; i++)
        if(!h->thread_context[i]->s.obmc_scratchpad)
            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);

    /* some macroblocks will be accessed before they're available */
    if(FRAME_MBAFF || s->avctx->thread_count > 1)
        memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));

//    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;

    // We mark the current picture as non-reference after allocating it, so
    // that if we break out due to an error it can be released automatically
    // in the next MPV_frame_start().
    // SVQ3 as well as most other codecs have only last/next/current and thus
    // get released even with set reference, besides SVQ3 and others do not
    // mark frames as reference later "naturally".
    if(s->codec_id != CODEC_ID_SVQ3)
        s->current_picture_ptr->reference= 0;

    s->current_picture_ptr->field_poc[0]=
    s->current_picture_ptr->field_poc[1]= INT_MAX;
    assert(s->current_picture_ptr->long_ref==0);

    return 0;
}
/**
 * Save the bottom row and right column of the just-decoded macroblock into
 * top_borders[]/left_border so the deblocking filter of neighbouring MBs
 * can still see the unfiltered samples. The MBAFF cases adjust offsets,
 * step and which of the two saved border lines is written.
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;
    int step = 1;
    int offset = 1;
    int uvoffset= 1;
    int top_idx = 1;
    int skiplast= 0;

    /* step back one row so index i addresses row i-1 of the MB */
    src_y -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            /* bottom MB of a pair */
            offset = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
            if(!MB_MBAFF){
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                }
            }
        }else{
            /* top MB of a pair */
            if(!MB_MBAFF){
                h->left_border[0]= h->top_borders[0][s->mb_x][15];
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
                    h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                }
                skiplast= 1;
            }
            offset =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
    for(i=1; i<17 - skiplast; i++){
        h->left_border[offset+i*step]= src_y[15+i* linesize];
    }

    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        /* chroma borders: cb at +34, cr at +34+18 in left_border;
         * +16 / +24 in top_borders */
        h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
        h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
        for(i=1; i<9 - skiplast; i++){
            h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
            h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
/**
 * Swap (xchg != 0) or copy the saved unfiltered border samples with the
 * current macroblock edges, so intra prediction sees unfiltered neighbours
 * while the deblocked picture is preserved. Called once before and once
 * after intra prediction with opposite xchg values.
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left;
    int deblock_top;
    int mb_xy;
    int step = 1;
    int offset = 1;
    int uvoffset= 1;
    int top_idx = 1;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            /* bottom MB of a pair */
            offset = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
        }else{
            offset =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    /* deblocking_filter==2 restricts filtering to slice-internal edges */
    if(h->deblocking_filter == 2) {
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        deblock_left = (s->mb_x > 0);
        deblock_top = (s->mb_y > !!MB_FIELD);
    }

    /* step back to the row above and the column left of the MB */
    src_y -= linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

/* swap a and b through t when xchg is set, otherwise only copy a into b */
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        for(i = !deblock_top; i<16; i++){
            XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
        }
        XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<8; i++){
                XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
            }
            XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
            XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
/**
 * Fully decode one macroblock: prediction (intra or inter), residual
 * addition, and deblocking-filter bookkeeping. Compiled twice: with
 * simple=1 (fast path, all uncommon-case branches constant-folded away)
 * and simple=0 (handles MBAFF/field, PCM, gray, SVQ3).
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: double the strides, use the field block offsets */
        linesize = h->mb_linesize = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* retag ref indices with the field parity for the loop filter */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize = h->mb_linesize = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    /* select the residual-add functions for this MB's transform mode */
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        /* raw samples were stored in h->mb by the parser; just copy them out */
        for (i=0; i<16; i++) {
            memcpy(dest_y + i* linesize, h->mb + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* expose unfiltered neighbour samples to intra prediction */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        /* 8x8 intra: predict and add residual per 8x8 block */
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        /* only these two modes read top-right samples; when
                         * unavailable, replicate the rightmost top sample */
                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                /* intra 16x16: one full-MB prediction plus a DC transform */
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }

        /* add the luma residual (intra4x4 already added it above) */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* add the chroma residual */
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        /* save unfiltered borders, then filter this MB */
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
/* Fast path: plain H.264 frame macroblocks with none of the uncommon
 * features; selected by hl_decode_mb() below via the simple=1 template. */
2702 * Process a macroblock; this case avoids checks for expensive uncommon cases.
2704 static void hl_decode_mb_simple(H264Context *h){
2705 hl_decode_mb_internal(h, 1);
2709 * Process a macroblock; this handles edge cases, such as interlacing.
/* Slow path: same worker as hl_decode_mb_simple() but with simple=0 so all
 * uncommon-case checks are compiled in; av_noinline keeps this cold body
 * from being merged into the hot caller. */
2711 static void av_noinline hl_decode_mb_complex(H264Context *h){
2712 hl_decode_mb_internal(h, 0);
/* Dispatch macroblock reconstruction to the simple or complex decoder.
 * The complex path is taken for MBAFF/field MBs, intra-PCM, non-H.264
 * codec ids (SVQ3), gray-only decoding, encoding, or size-optimized builds. */
2715 static void hl_decode_mb(H264Context *h){
2716 MpegEncContext * const s = &h->s;
2717 const int mb_xy= h->mb_xy;
2718 const int mb_type= s->current_picture.mb_type[mb_xy];
2719 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2720 (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
/* In encoder builds, skip reconstruction entirely unless s->decode is set. */
2722 if(ENABLE_H264_ENCODER && !s->decode)
2723 return;
2725 if (is_complex)
2726 hl_decode_mb_complex(h);
2727 else hl_decode_mb_simple(h);
/* Turn a frame Picture in place into a view of one of its fields:
 * for the bottom field the data pointers are advanced by one line,
 * the linesizes are doubled so field rows are addressed with frame
 * strides, reference marking is set to the field parity, and poc is
 * taken from the matching field_poc entry. */
2730 static void pic_as_field(Picture *pic, const int parity){
2731 int i;
2732 for (i = 0; i < 4; ++i) {
2733 if (parity == PICT_BOTTOM_FIELD)
2734 pic->data[i] += pic->linesize[i];
2735 pic->reference = parity;
2736 pic->linesize[i] *= 2;
2738 pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
/* Copy *src to *dest if src is referenced with the requested parity.
 * For field parities (parity != PICT_FRAME) the copy is converted with
 * pic_as_field() and pic_id is rescaled to a field pic_num (2*id + id_add).
 * @return 1 if a copy was made, 0 otherwise */
2741 static int split_field_copy(Picture *dest, Picture *src,
2742 int parity, int id_add){
2743 int match = !!(src->reference & parity);
2745 if (match) {
2746 *dest = *src;
2747 if(parity != PICT_FRAME){
2748 pic_as_field(dest, parity);
2749 dest->pic_id *= 2;
2750 dest->pic_id += id_add;
2754 return match;
/* Build (part of) a default reference list from 'in' (len entries, entries
 * may be NULL), alternating pictures whose reference marking matches 'sel'
 * with those matching the opposite parity (sel^3). pic_id is set to the
 * long-term index (is_long) or frame_num before each copy.
 * @return number of entries written to def */
2757 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2758 int i[2]={0};
2759 int index=0;
/* i[0]/i[1] independently scan for the next same-parity / opposite-parity
 * candidate; both advance past entries that do not match. */
2761 while(i[0]<len || i[1]<len){
2762 while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2763 i[0]++;
2764 while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2765 i[1]++;
2766 if(i[0] < len){
2767 in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2768 split_field_copy(&def[index++], in[ i[0]++ ], sel , 1);
2770 if(i[1] < len){
2771 in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2772 split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2776 return index;
/* Append pictures from src[0..len) to 'sorted', ordered by POC relative to
 * 'limit': with dir==0 the POCs greater than limit in ascending order, with
 * dir==1 the POCs less than or equal to limit in descending order
 * (repeated best-candidate selection, no duplicate output since limit is
 * tightened to the last emitted POC each round).
 * @return number of pictures appended */
2779 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2780 int i, best_poc;
2781 int out_i= 0;
2783 for(;;){
2784 best_poc= dir ? INT_MIN : INT_MAX;
2786 for(i=0; i<len; i++){
2787 const int poc= src[i]->poc;
2788 if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2789 best_poc= poc;
2790 sorted[out_i]= src[i];
/* No candidate left on this side of the limit -> done. */
2793 if(best_poc == (dir ? INT_MIN : INT_MAX))
2794 break;
2795 limit= sorted[out_i++]->poc - dir;
2797 return out_i;
2801 * fills the default_ref_list.
/* Builds h->default_ref_list[0] (and [1] for B slices): B lists are formed
 * from short-term refs sorted around the current POC, other slice types keep
 * short-term decoding order; long-term refs are appended afterwards. */
2803 static int fill_default_ref_list(H264Context *h){
2804 MpegEncContext * const s = &h->s;
2805 int i, len;
2807 if(h->slice_type_nos==FF_B_TYPE){
2808 Picture *sorted[32];
2809 int cur_poc, list;
2810 int lens[2];
/* For field pictures compare against the current field's POC. */
2812 if(FIELD_PICTURE)
2813 cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2814 else
2815 cur_poc= s->current_picture_ptr->poc;
2817 for(list= 0; list<2; list++){
/* L0: past refs first, then future; L1: the reverse (dir flipped by list). */
2818 len= add_sorted(sorted , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2819 len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2820 assert(len<=32);
2821 len= build_def_list(h->default_ref_list[list] , sorted , len, 0, s->picture_structure);
2822 len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2823 assert(len<=32);
2825 if(len < h->ref_count[list])
2826 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2827 lens[list]= len;
/* If list1 came out identical to list0, swap list1's first two entries. */
2830 if(lens[0] == lens[1] && lens[1] > 1){
/* NOTE(review): the element compare runs before the i<lens[0] bound check;
 * safer order would be i<lens[0] && ... — confirm against upstream fix. */
2831 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2832 if(i == lens[0])
2833 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2835 }else{
2836 len = build_def_list(h->default_ref_list[0] , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2837 len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16 , 1, s->picture_structure);
2838 assert(len <= 32);
2839 if(len < h->ref_count[0])
2840 memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2842 #ifdef TRACE
2843 for (i=0; i<h->ref_count[0]; i++) {
2844 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2846 if(h->slice_type_nos==FF_B_TYPE){
2847 for (i=0; i<h->ref_count[1]; i++) {
2848 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2851 #endif
2852 return 0;
/* Forward declarations: MMCO debug helpers defined later in this file. */
2855 static void print_short_term(H264Context *h);
2856 static void print_long_term(H264Context *h);
2859 * Extract structure information about the picture described by pic_num in
2860 * the current decoding context (frame or field). Note that pic_num is
2861 * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2862 * @param pic_num picture number for which to extract structure information
2863 * @param structure one of PICT_XXX describing structure of picture
2864 * with pic_num
2865 * @return frame number (short term) or long term index of picture
2866 * described by pic_num
2868 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2869 MpegEncContext * const s = &h->s;
2871 *structure = s->picture_structure;
2872 if(FIELD_PICTURE){
/* In field decoding the low bit of pic_num is the parity flag: even values
 * denote the opposite-parity field; the remaining bits are the number/index. */
2873 if (!(pic_num & 1))
2874 /* opposite field */
2875 *structure ^= PICT_FRAME;
2876 pic_num >>= 1;
2879 return pic_num;
/* Parse the ref_pic_list_reordering() syntax and apply it: start from the
 * default lists, then for each reordering command locate the addressed
 * short-term (idc 0/1) or long-term (idc 2) picture and move it to the
 * current index, shifting the intervening entries down.
 * @return 0 on success, -1 on bitstream error */
2882 static int decode_ref_pic_list_reordering(H264Context *h){
2883 MpegEncContext * const s = &h->s;
2884 int list, index, pic_structure;
2886 print_short_term(h);
2887 print_long_term(h);
2889 for(list=0; list<h->list_count; list++){
2890 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
/* ref_pic_list_reordering_flag_l0/l1 */
2892 if(get_bits1(&s->gb)){
2893 int pred= h->curr_pic_num;
2895 for(index=0; ; index++){
2896 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2897 unsigned int pic_id;
2898 int i;
2899 Picture *ref = NULL;
/* idc 3 terminates the reordering command list. */
2901 if(reordering_of_pic_nums_idc==3)
2902 break;
2904 if(index >= h->ref_count[list]){
2905 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2906 return -1;
2909 if(reordering_of_pic_nums_idc<3){
2910 if(reordering_of_pic_nums_idc<2){
/* Short-term: idc 0 subtracts, idc 1 adds abs_diff_pic_num_minus1+1
 * to the running predicted pic_num (modulo max_pic_num). */
2911 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2912 int frame_num;
2914 if(abs_diff_pic_num > h->max_pic_num){
2915 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2916 return -1;
2919 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2920 else pred+= abs_diff_pic_num;
2921 pred &= h->max_pic_num - 1;
2923 frame_num = pic_num_extract(h, pred, &pic_structure);
/* Search the short-term list back to front; i<0 means not found. */
2925 for(i= h->short_ref_count-1; i>=0; i--){
2926 ref = h->short_ref[i];
2927 assert(ref->reference);
2928 assert(!ref->long_ref);
2930 ref->frame_num == frame_num &&
2931 (ref->reference & pic_structure)
2933 break;
2935 if(i>=0)
2936 ref->pic_id= pred;
2937 }else{
/* Long-term: look up by long_term_pic_idx. */
2938 int long_idx;
2939 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2941 long_idx= pic_num_extract(h, pic_id, &pic_structure);
2943 if(long_idx>31){
2944 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2945 return -1;
2947 ref = h->long_ref[long_idx];
2948 assert(!(ref && !ref->reference));
2949 if(ref && (ref->reference & pic_structure)){
2950 ref->pic_id= pic_id;
2951 assert(ref->long_ref);
2952 i=0;
2953 }else{
2954 i=-1;
2958 if (i < 0) {
2959 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2960 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2961 } else {
/* Find the old position of ref (or the list end), shift entries down
 * by one, then place ref at the current index. */
2962 for(i=index; i+1<h->ref_count[list]; i++){
2963 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2964 break;
2966 for(; i > index; i--){
2967 h->ref_list[list][i]= h->ref_list[list][i-1];
2969 h->ref_list[list][index]= *ref;
2970 if (FIELD_PICTURE){
2971 pic_as_field(&h->ref_list[list][index], pic_structure);
2974 }else{
2975 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2976 return -1;
/* Paper over any holes left in the lists so later code has valid data. */
2981 for(list=0; list<h->list_count; list++){
2982 for(index= 0; index < h->ref_count[list]; index++){
2983 if(!h->ref_list[list][index].data[0]){
2984 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2985 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2990 return 0;
/* For MBAFF decoding, derive per-field reference entries: for each frame in
 * ref_list, store a top-field and a bottom-field view at indices 16+2*i and
 * 16+2*i+1 (doubled linesize, offset data for the bottom field), and
 * replicate the corresponding prediction weights/offsets. */
2993 static void fill_mbaff_ref_list(H264Context *h){
2994 int list, i, j;
2995 for(list=0; list<2; list++){ //FIXME try list_count
2996 for(i=0; i<h->ref_count[list]; i++){
2997 Picture *frame = &h->ref_list[list][i];
2998 Picture *field = &h->ref_list[list][16+2*i];
2999 field[0] = *frame;
3000 for(j=0; j<3; j++)
3001 field[0].linesize[j] <<= 1;
3002 field[0].reference = PICT_TOP_FIELD;
3003 field[0].poc= field[0].field_poc[0];
3004 field[1] = field[0];
3005 for(j=0; j<3; j++)
3006 field[1].data[j] += frame->linesize[j];
3007 field[1].reference = PICT_BOTTOM_FIELD;
3008 field[1].poc= field[1].field_poc[1];
/* Both fields inherit the frame's explicit weights/offsets. */
3010 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3011 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3012 for(j=0; j<2; j++){
3013 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3014 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* Implicit weights are replicated along both list axes. */
3018 for(j=0; j<h->ref_count[1]; j++){
3019 for(i=0; i<h->ref_count[0]; i++)
3020 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3021 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
3022 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
/* Parse the pred_weight_table() slice-header syntax (explicit weighted
 * prediction): per list and reference, optional luma and chroma weights and
 * offsets; absent entries fall back to the default weight (1<<denom) and
 * offset 0. Sets h->use_weight / h->use_weight_chroma when any entry
 * differs from the defaults.
 * @return 0 (no error paths) */
3026 static int pred_weight_table(H264Context *h){
3027 MpegEncContext * const s = &h->s;
3028 int list, i;
3029 int luma_def, chroma_def;
3031 h->use_weight= 0;
3032 h->use_weight_chroma= 0;
3033 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3034 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3035 luma_def = 1<<h->luma_log2_weight_denom;
3036 chroma_def = 1<<h->chroma_log2_weight_denom;
3038 for(list=0; list<2; list++){
3039 for(i=0; i<h->ref_count[list]; i++){
3040 int luma_weight_flag, chroma_weight_flag;
3042 luma_weight_flag= get_bits1(&s->gb);
3043 if(luma_weight_flag){
3044 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3045 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3046 if( h->luma_weight[list][i] != luma_def
3047 || h->luma_offset[list][i] != 0)
3048 h->use_weight= 1;
3049 }else{
3050 h->luma_weight[list][i]= luma_def;
3051 h->luma_offset[list][i]= 0;
/* Chroma entries only exist in the bitstream for chroma formats. */
3054 if(CHROMA){
3055 chroma_weight_flag= get_bits1(&s->gb);
3056 if(chroma_weight_flag){
3057 int j;
3058 for(j=0; j<2; j++){
3059 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3060 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3061 if( h->chroma_weight[list][i][j] != chroma_def
3062 || h->chroma_offset[list][i][j] != 0)
3063 h->use_weight_chroma= 1;
3065 }else{
3066 int j;
3067 for(j=0; j<2; j++){
3068 h->chroma_weight[list][i][j]= chroma_def;
3069 h->chroma_offset[list][i][j]= 0;
/* Only B slices carry a list-1 weight table. */
3074 if(h->slice_type_nos != FF_B_TYPE) break;
3076 h->use_weight= h->use_weight || h->use_weight_chroma;
3077 return 0;
/* Derive the implicit (POC-distance based) bi-prediction weight table for
 * every (ref0, ref1) pair; use_weight==2 marks implicit mode. Falls back to
 * unweighted (use_weight=0) in the symmetric single-reference case. */
3080 static void implicit_weight_table(H264Context *h){
3081 MpegEncContext * const s = &h->s;
3082 int ref0, ref1;
3083 int cur_poc = s->current_picture_ptr->poc;
3085 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3086 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3087 h->use_weight= 0;
3088 h->use_weight_chroma= 0;
3089 return;
3092 h->use_weight= 2;
3093 h->use_weight_chroma= 2;
3094 h->luma_log2_weight_denom= 5;
3095 h->chroma_log2_weight_denom= 5;
3097 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3098 int poc0 = h->ref_list[0][ref0].poc;
3099 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3100 int poc1 = h->ref_list[1][ref1].poc;
/* td/tb/tx/dist_scale_factor follow the spec's fixed-point derivation;
 * out-of-range scale factors fall back to the equal weight 32. */
3101 int td = av_clip(poc1 - poc0, -128, 127);
3102 if(td){
3103 int tb = av_clip(cur_poc - poc0, -128, 127);
3104 int tx = (16384 + (FFABS(td) >> 1)) / td;
3105 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3106 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3107 h->implicit_weight[ref0][ref1] = 32;
3108 else
3109 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3110 }else
3111 h->implicit_weight[ref0][ref1] = 32;
3117 * Mark a picture as no longer needed for reference. The refmask
3118 * argument allows unreferencing of individual fields or the whole frame.
3119 * If the picture becomes entirely unreferenced, but is being held for
3120 * display purposes, it is marked as such.
3121 * @param refmask mask of fields to unreference; the mask is bitwise
3122 * anded with the reference marking of pic
3123 * @return non-zero if pic becomes entirely unreferenced (except possibly
3124 * for display purposes) zero if one of the fields remains in
3125 * reference
3127 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3128 int i;
3129 if (pic->reference &= refmask) {
3130 return 0;
3131 } else {
/* Fully unreferenced: if it is still queued for output, keep its buffer
 * alive by re-marking it with the special DELAYED_PIC_REF value. */
3132 for(i = 0; h->delayed_pic[i]; i++)
3133 if(pic == h->delayed_pic[i]){
3134 pic->reference=DELAYED_PIC_REF;
3135 break;
3137 return 1;
3142 * instantaneous decoder refresh.
/* Clear all reference state for an IDR: drop every long-term and short-term
 * reference and reset the frame_num/POC prediction history. */
3144 static void idr(H264Context *h){
3145 int i;
3147 for(i=0; i<16; i++){
3148 remove_long(h, i, 0);
3150 assert(h->long_ref_count==0);
3152 for(i=0; i<h->short_ref_count; i++){
3153 unreference_pic(h, h->short_ref[i], 0);
3154 h->short_ref[i]= NULL;
3156 h->short_ref_count=0;
3157 h->prev_frame_num= 0;
3158 h->prev_frame_num_offset= 0;
3159 h->prev_poc_msb=
3160 h->prev_poc_lsb= 0;
3163 /* forget old pics after a seek */
/* AVCodecContext.flush callback: drop the delayed-output queue, clear all
 * references via idr(), unreference the in-progress picture and flush the
 * underlying MPEG context. */
3164 static void flush_dpb(AVCodecContext *avctx){
3165 H264Context *h= avctx->priv_data;
3166 int i;
3167 for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3168 if(h->delayed_pic[i])
3169 h->delayed_pic[i]->reference= 0;
3170 h->delayed_pic[i]= NULL;
3172 h->outputed_poc= INT_MIN;
3173 idr(h);
3174 if(h->s.current_picture_ptr)
3175 h->s.current_picture_ptr->reference= 0;
3176 h->s.first_field= 0;
3177 ff_mpeg_flush(avctx);
3181 * Find a Picture in the short term reference list by frame number.
3182 * @param frame_num frame number to search for
3183 * @param idx the index into h->short_ref where returned picture is found
3184 * undefined if no picture found.
3185 * @return pointer to the found picture, or NULL if no pic with the provided
3186 * frame number is found
3188 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3189 MpegEncContext * const s = &h->s;
3190 int i;
/* Linear scan; the list is small (at most the SPS reference-frame count). */
3192 for(i=0; i<h->short_ref_count; i++){
3193 Picture *pic= h->short_ref[i];
3194 if(s->avctx->debug&FF_DEBUG_MMCO)
3195 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3196 if(pic->frame_num == frame_num) {
3197 *idx = i;
3198 return pic;
3201 return NULL;
3205 * Remove a picture from the short term reference list by its index in
3206 * that list. This does no checking on the provided index; it is assumed
3207 * to be valid. Other list entries are shifted down.
3208 * @param i index into h->short_ref of picture to remove.
3210 static void remove_short_at_index(H264Context *h, int i){
3211 assert(i >= 0 && i < h->short_ref_count);
3212 h->short_ref[i]= NULL;
/* Close the gap; the memmove is skipped when the list becomes empty. */
3213 if (--h->short_ref_count)
3214 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
/* Find the short-term reference with the given frame_num, unreference the
 * fields selected by ref_mask, and drop it from the list if it became
 * entirely unreferenced. */
3219 * @return the removed picture or NULL if an error occurs
3221 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3222 MpegEncContext * const s = &h->s;
3223 Picture *pic;
3224 int i;
3226 if(s->avctx->debug&FF_DEBUG_MMCO)
3227 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3229 pic = find_short(h, frame_num, &i);
3230 if (pic){
3231 if(unreference_pic(h, pic, ref_mask))
3232 remove_short_at_index(h, i);
3235 return pic;
3239 * Remove a picture from the long term reference list by its index in
3240 * that list.
3241 * @return the removed picture or NULL if an error occurs
3243 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3244 Picture *pic;
3246 pic= h->long_ref[i];
3247 if (pic){
/* Only clear the slot when both fields became unreferenced. */
3248 if(unreference_pic(h, pic, ref_mask)){
3249 assert(h->long_ref[i]->long_ref == 1);
3250 h->long_ref[i]->long_ref= 0;
3251 h->long_ref[i]= NULL;
3252 h->long_ref_count--;
3256 return pic;
3260 * print short term list
/* Debug-only dump of the short-term reference list (FF_DEBUG_MMCO). */
3262 static void print_short_term(H264Context *h) {
3263 uint32_t i;
3264 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3265 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3266 for(i=0; i<h->short_ref_count; i++){
3267 Picture *pic= h->short_ref[i];
3268 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3274 * print long term list
/* Debug-only dump of all 16 long-term reference slots (FF_DEBUG_MMCO);
 * empty slots are skipped. */
3276 static void print_long_term(H264Context *h) {
3277 uint32_t i;
3278 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3279 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3280 for(i = 0; i < 16; i++){
3281 Picture *pic= h->long_ref[i];
3282 if (pic) {
3283 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3290 * Executes the reference picture marking (memory management control operations).
/* Applies the parsed MMCO list to the short/long-term reference sets, then
 * handles the implicit marking of the current picture (or its second field)
 * and enforces the SPS reference-frame count limit.
 * @return 0 */
3292 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3293 MpegEncContext * const s = &h->s;
3294 int i, j;
3295 int current_ref_assigned=0;
3296 Picture *pic;
3298 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3299 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3301 for(i=0; i<mmco_count; i++){
3302 int structure, frame_num;
3303 if(s->avctx->debug&FF_DEBUG_MMCO)
3304 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
/* Both short-term opcodes need the addressed short-term picture; a miss
 * is only tolerated for SHORT2LONG when the move was already done. */
3306 if( mmco[i].opcode == MMCO_SHORT2UNUSED
3307 || mmco[i].opcode == MMCO_SHORT2LONG){
3308 frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3309 pic = find_short(h, frame_num, &j);
3310 if(!pic){
3311 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3312 || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3313 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3314 continue;
3318 switch(mmco[i].opcode){
3319 case MMCO_SHORT2UNUSED:
3320 if(s->avctx->debug&FF_DEBUG_MMCO)
3321 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3322 remove_short(h, frame_num, structure ^ PICT_FRAME);
3323 break;
3324 case MMCO_SHORT2LONG:
/* Evict any other occupant of the long slot, then move pic into it. */
3325 if (h->long_ref[mmco[i].long_arg] != pic)
3326 remove_long(h, mmco[i].long_arg, 0);
3328 remove_short_at_index(h, j);
3329 h->long_ref[ mmco[i].long_arg ]= pic;
3330 if (h->long_ref[ mmco[i].long_arg ]){
3331 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3332 h->long_ref_count++;
3334 break;
3335 case MMCO_LONG2UNUSED:
3336 j = pic_num_extract(h, mmco[i].long_arg, &structure);
3337 pic = h->long_ref[j];
3338 if (pic) {
3339 remove_long(h, j, structure ^ PICT_FRAME);
3340 } else if(s->avctx->debug&FF_DEBUG_MMCO)
3341 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3342 break;
3343 case MMCO_LONG:
3344 // Comment below left from previous code as it is an interresting note.
3345 /* First field in pair is in short term list or
3346 * at a different long term index.
3347 * This is not allowed; see 7.4.3.3, notes 2 and 3.
3348 * Report the problem and keep the pair where it is,
3349 * and mark this field valid.
3352 if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3353 remove_long(h, mmco[i].long_arg, 0);
3355 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3356 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3357 h->long_ref_count++;
3360 s->current_picture_ptr->reference |= s->picture_structure;
3361 current_ref_assigned=1;
3362 break;
3363 case MMCO_SET_MAX_LONG:
3364 assert(mmco[i].long_arg <= 16);
3365 // just remove the long term which index is greater than new max
3366 for(j = mmco[i].long_arg; j<16; j++){
3367 remove_long(h, j, 0);
3369 break;
3370 case MMCO_RESET:
3371 while(h->short_ref_count){
3372 remove_short(h, h->short_ref[0]->frame_num, 0);
3374 for(j = 0; j < 16; j++) {
3375 remove_long(h, j, 0);
3377 s->current_picture_ptr->poc=
3378 s->current_picture_ptr->field_poc[0]=
3379 s->current_picture_ptr->field_poc[1]=
3380 h->poc_lsb=
3381 h->poc_msb=
3382 h->frame_num=
3383 s->current_picture_ptr->frame_num= 0;
3384 break;
3385 default: assert(0);
/* No MMCO made the current picture a long-term ref: mark it (or its
 * second field) as a short-term reference, the default behaviour. */
3389 if (!current_ref_assigned) {
3390 /* Second field of complementary field pair; the first field of
3391 * which is already referenced. If short referenced, it
3392 * should be first entry in short_ref. If not, it must exist
3393 * in long_ref; trying to put it on the short list here is an
3394 * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3396 if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3397 /* Just mark the second field valid */
3398 s->current_picture_ptr->reference = PICT_FRAME;
3399 } else if (s->current_picture_ptr->long_ref) {
3400 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3401 "assignment for second field "
3402 "in complementary field pair "
3403 "(first field is long term)\n");
3404 } else {
3405 pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3406 if(pic){
3407 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3410 if(h->short_ref_count)
3411 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3413 h->short_ref[0]= s->current_picture_ptr;
3414 h->short_ref_count++;
3415 s->current_picture_ptr->reference |= s->picture_structure;
3419 if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3421 /* We have too many reference frames, probably due to corrupted
3422 * stream. Need to discard one frame. Prevents overrun of the
3423 * short_ref and long_ref buffers.
3425 av_log(h->s.avctx, AV_LOG_ERROR,
3426 "number of reference frames exceeds max (probably "
3427 "corrupt input), discarding one\n");
3429 if (h->long_ref_count && !h->short_ref_count) {
3430 for (i = 0; i < 16; ++i)
3431 if (h->long_ref[i])
3432 break;
3434 assert(i < 16);
3435 remove_long(h, i, 0);
3436 } else {
3437 pic = h->short_ref[h->short_ref_count - 1];
3438 remove_short(h, pic->frame_num, 0);
3442 print_short_term(h);
3443 print_long_term(h);
3444 return 0;
/* Parse dec_ref_pic_marking() from the slice header into h->mmco[] /
 * h->mmco_index. For IDR slices this is the no_output/long_term flags; for
 * other slices either an explicit MMCO list or, in sliding-window mode, a
 * synthesized SHORT2UNUSED for the oldest short-term ref when full.
 * @return 0 on success, -1 on bitstream error */
3447 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3448 MpegEncContext * const s = &h->s;
3449 int i;
3451 h->mmco_index= 0;
3452 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3453 s->broken_link= get_bits1(gb) -1;
/* long_term_reference_flag: make the IDR picture long-term index 0. */
3454 if(get_bits1(gb)){
3455 h->mmco[0].opcode= MMCO_LONG;
3456 h->mmco[0].long_arg= 0;
3457 h->mmco_index= 1;
3459 }else{
3460 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3461 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3462 MMCOOpcode opcode= get_ue_golomb(gb);
3464 h->mmco[i].opcode= opcode;
3465 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3466 h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3467 /* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3468 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3469 return -1;
3472 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3473 unsigned int long_arg= get_ue_golomb(gb);
3474 if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3475 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3476 return -1;
3478 h->mmco[i].long_arg= long_arg;
3481 if(opcode > (unsigned)MMCO_LONG){
3482 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3483 return -1;
3485 if(opcode == MMCO_END)
3486 break;
3488 h->mmco_index= i;
3489 }else{
/* Sliding window: when the reference buffer is full, emit an implicit
 * SHORT2UNUSED for the oldest short-term reference (both fields when
 * field-coded), unless the second field of the current pair is pending. */
3490 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3492 if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3493 !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3494 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3495 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3496 h->mmco_index= 1;
3497 if (FIELD_PICTURE) {
3498 h->mmco[0].short_pic_num *= 2;
3499 h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3500 h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3501 h->mmco_index= 2;
3507 return 0;
/* Compute the picture order count of the current picture from the parsed
 * slice-header values, implementing the three POC derivation modes:
 * type 0 (poc_lsb/msb wrap tracking), type 1 (expected-delta cycles) and
 * type 2 (POC follows frame_num). Stores the per-field POCs and the frame
 * poc (min of the two fields).
 * @return 0 */
3510 static int init_poc(H264Context *h){
3511 MpegEncContext * const s = &h->s;
3512 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3513 int field_poc[2];
3514 Picture *cur = s->current_picture_ptr;
3516 h->frame_num_offset= h->prev_frame_num_offset;
3517 if(h->frame_num < h->prev_frame_num)
3518 h->frame_num_offset += max_frame_num;
3520 if(h->sps.poc_type==0){
3521 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
/* Detect poc_lsb wrap-around in either direction to update poc_msb. */
3523 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3524 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3525 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3526 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3527 else
3528 h->poc_msb = h->prev_poc_msb;
3529 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3530 field_poc[0] =
3531 field_poc[1] = h->poc_msb + h->poc_lsb;
3532 if(s->picture_structure == PICT_FRAME)
3533 field_poc[1] += h->delta_poc_bottom;
3534 }else if(h->sps.poc_type==1){
3535 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3536 int i;
3538 if(h->sps.poc_cycle_length != 0)
3539 abs_frame_num = h->frame_num_offset + h->frame_num;
3540 else
3541 abs_frame_num = 0;
3543 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3544 abs_frame_num--;
3546 expected_delta_per_poc_cycle = 0;
3547 for(i=0; i < h->sps.poc_cycle_length; i++)
3548 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3550 if(abs_frame_num > 0){
3551 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3552 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3554 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3555 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3556 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3557 } else
3558 expectedpoc = 0;
3560 if(h->nal_ref_idc == 0)
3561 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3563 field_poc[0] = expectedpoc + h->delta_poc[0];
3564 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3566 if(s->picture_structure == PICT_FRAME)
3567 field_poc[1] += h->delta_poc[1];
3568 }else{
/* poc_type 2: POC is derived directly from frame_num; non-reference
 * pictures get an odd value one less than the following reference. */
3569 int poc= 2*(h->frame_num_offset + h->frame_num);
3571 if(!h->nal_ref_idc)
3572 poc--;
3574 field_poc[0]= poc;
3575 field_poc[1]= poc;
/* Only store the field POC(s) belonging to the current structure. */
3578 if(s->picture_structure != PICT_BOTTOM_FIELD)
3579 s->current_picture_ptr->field_poc[0]= field_poc[0];
3580 if(s->picture_structure != PICT_TOP_FIELD)
3581 s->current_picture_ptr->field_poc[1]= field_poc[1];
3582 cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3584 return 0;
3589 * initialize scan tables
/* Builds the zigzag/field scan tables used for coefficient decoding. When
 * the (SIMD) IDCT uses a permuted coefficient layout, the scans are
 * transposed with the T() macros to match; the _q0 variants used at qscale 0
 * with transform_bypass keep the unpermuted spec order. */
3591 static void init_scan_tables(H264Context *h){
3592 MpegEncContext * const s = &h->s;
3593 int i;
3594 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3595 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3596 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3597 }else{
3598 for(i=0; i<16; i++){
3599 #define T(x) (x>>2) | ((x<<2) & 0xF)
3600 h->zigzag_scan[i] = T(zigzag_scan[i]);
3601 h-> field_scan[i] = T( field_scan[i]);
3602 #undef T
3605 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3606 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3607 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3608 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3609 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3610 }else{
3611 for(i=0; i<64; i++){
3612 #define T(x) (x>>3) | ((x&7)<<3)
3613 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3614 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3615 h->field_scan8x8[i] = T(field_scan8x8[i]);
3616 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3617 #undef T
3620 if(h->sps.transform_bypass){ //FIXME same ugly
3621 h->zigzag_scan_q0 = zigzag_scan;
3622 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3623 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3624 h->field_scan_q0 = field_scan;
3625 h->field_scan8x8_q0 = field_scan8x8;
3626 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3627 }else{
3628 h->zigzag_scan_q0 = h->zigzag_scan;
3629 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3630 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3631 h->field_scan_q0 = h->field_scan;
3632 h->field_scan8x8_q0 = h->field_scan8x8;
3633 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3638 * Replicates H264 "master" context to thread contexts.
/* Shallow-copies the per-picture state a slice-thread worker needs from the
 * master context: current picture pointers, strides, POC/frame_num history,
 * reference lists and dequant tables. */
3640 static void clone_slice(H264Context *dst, H264Context *src)
3642 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3643 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3644 dst->s.current_picture = src->s.current_picture;
3645 dst->s.linesize = src->s.linesize;
3646 dst->s.uvlinesize = src->s.uvlinesize;
3647 dst->s.first_field = src->s.first_field;
3649 dst->prev_poc_msb = src->prev_poc_msb;
3650 dst->prev_poc_lsb = src->prev_poc_lsb;
3651 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3652 dst->prev_frame_num = src->prev_frame_num;
3653 dst->short_ref_count = src->short_ref_count;
3655 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3656 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3657 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3658 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3660 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3661 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3665 * decodes a slice header.
3666 * This will also call MPV_common_init() and frame_start() as needed.
3668 * @param h h264context
3669 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3671 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
static int decode_slice_header(H264Context *h, H264Context *h0){
    MpegEncContext * const s = &h->s;
    MpegEncContext * const s0 = &h0->s;
    unsigned int first_mb_in_slice;
    unsigned int pps_id;
    int num_ref_idx_active_override_flag;
    unsigned int slice_type, tmp, i, j;
    int default_ref_list_done = 0;
    int last_pic_structure;

    /* A slice with nal_ref_idc == 0 is never used as a reference, so the
     * frame it belongs to may be dropped without corrupting later output. */
    s->dropable= h->nal_ref_idc == 0;

    /* With CODEC_FLAG2_FAST, non-reference frames use the cheaper 2-tap
     * qpel interpolation; errors cannot propagate since nothing refs them. */
    if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
        s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
        s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
    }else{
        s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
        s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
    }

    first_mb_in_slice= get_ue_golomb(&s->gb);

    /* In CHUNKS mode the caller feeds partial frames; a slice starting at
     * macroblock 0 marks the beginning of a new picture. */
    if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
        h0->current_slice = 0;
        if (!s0->first_field)
            s->current_picture_ptr= NULL;
    }

    slice_type= get_ue_golomb(&s->gb);
    if(slice_type > 9){
        av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
        return -1;
    }
    /* Types 5..9 mean "same as 0..4 and fixed for the whole picture". */
    if(slice_type > 4){
        slice_type -= 5;
        h->slice_type_fixed=1;
    }else
        h->slice_type_fixed=0;

    slice_type= golomb_to_pict_type[ slice_type ];
    /* I slices and repeats of the previous slice type can reuse the
     * already-built default reference list. */
    if (slice_type == FF_I_TYPE
        || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
        default_ref_list_done = 1;
    }
    h->slice_type= slice_type;
    h->slice_type_nos= slice_type & 3; /* strips the SI/SP distinction */

    s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
    if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
        av_log(h->s.avctx, AV_LOG_ERROR,
               "B picture before any references, skipping\n");
        return -1;
    }

    pps_id= get_ue_golomb(&s->gb);
    if(pps_id>=MAX_PPS_COUNT){
        av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
        return -1;
    }
    if(!h0->pps_buffers[pps_id]) {
        av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
        return -1;
    }
    /* Copy (not alias) the parameter sets: slice threads each hold their own. */
    h->pps= *h0->pps_buffers[pps_id];

    if(!h0->sps_buffers[h->pps.sps_id]) {
        av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
        return -1;
    }
    h->sps = *h0->sps_buffers[h->pps.sps_id];

    /* Dequant tables depend on the PPS scaling lists; rebuild on PPS change
     * (only on the master context, clones share via clone_tables()). */
    if(h == h0 && h->dequant_coeff_pps != pps_id){
        h->dequant_coeff_pps = pps_id;
        init_dequant_tables(h);
    }

    s->mb_width= h->sps.mb_width;
    /* Field coding doubles the mb height count stored in the SPS. */
    s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);

    h->b_stride=  s->mb_width*4;
    h->b8_stride= s->mb_width*2;

    /* Apply SPS cropping (clamped; crop values are in 2-sample units,
     * doubled vertically for field-coded streams). */
    s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
    if(h->sps.frame_mbs_only_flag)
        s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
    else
        s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);

    if (s->context_initialized
        && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
        if(h != h0)
            return -1;   // width / height changed during parallelized decoding
        free_tables(h);
        MPV_common_end(s);
    }
    if (!s->context_initialized) {
        if(h != h0)
            return -1;  // we cant (re-)initialize context during parallel decoding
        if (MPV_common_init(s) < 0)
            return -1;
        s->first_field = 0;

        init_scan_tables(h);
        alloc_tables(h);

        /* Set up one H264Context per slice thread, cloning the master's
         * MpegEncContext and sharing its large tables. */
        for(i = 1; i < s->avctx->thread_count; i++) {
            H264Context *c;
            c = h->thread_context[i] = av_malloc(sizeof(H264Context));
            memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
            memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
            c->sps = h->sps;
            c->pps = h->pps;
            init_scan_tables(c);
            clone_tables(c, h);
        }

        for(i = 0; i < s->avctx->thread_count; i++)
            if(context_init(h->thread_context[i]) < 0)
                return -1;

        s->avctx->width = s->width;
        s->avctx->height = s->height;
        s->avctx->sample_aspect_ratio= h->sps.sar;
        if(!s->avctx->sample_aspect_ratio.den)
            s->avctx->sample_aspect_ratio.den = 1;

        if(h->sps.timing_info_present_flag){
            /* num_units_in_tick is per field, hence the factor of 2. */
            s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
            /* Old x264 builds (< 44) wrote a non-conforming time_scale. */
            if(h->x264_build > 0 && h->x264_build < 44)
                s->avctx->time_base.den *= 2;
            av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
                      s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
        }
    }

    h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);

    h->mb_mbaff = 0;
    h->mb_aff_frame = 0;
    last_pic_structure = s0->picture_structure;
    if(h->sps.frame_mbs_only_flag){
        s->picture_structure= PICT_FRAME;
    }else{
        if(get_bits1(&s->gb)) { //field_pic_flag
            s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
        } else {
            s->picture_structure= PICT_FRAME;
            h->mb_aff_frame = h->sps.mb_aff;
        }
    }
    h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;

    if(h0->current_slice == 0){
        /* Conceal gaps in frame_num by emitting dummy frames so the
         * reference machinery stays consistent. */
        while(h->frame_num !=  h->prev_frame_num &&
              h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
            av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
            frame_start(h);
            h->prev_frame_num++;
            h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
            s->current_picture_ptr->frame_num= h->prev_frame_num;
            execute_ref_pic_marking(h, NULL, 0);
        }

        /* See if we have a decoded first field looking for a pair... */
        if (s0->first_field) {
            assert(s0->current_picture_ptr);
            assert(s0->current_picture_ptr->data[0]);
            assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);

            /* figure out if we have a complementary field pair */
            if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
                /*
                 * Previous field is unmatched. Don't display it, but let it
                 * remain for reference if marked as such.
                 */
                s0->current_picture_ptr = NULL;
                s0->first_field = FIELD_PICTURE;

            } else {
                if (h->nal_ref_idc &&
                        s0->current_picture_ptr->reference &&
                        s0->current_picture_ptr->frame_num != h->frame_num) {
                    /*
                     * This and previous field were reference, but had
                     * different frame_nums. Consider this field first in
                     * pair. Throw away previous field except for reference
                     * purposes.
                     */
                    s0->first_field = 1;
                    s0->current_picture_ptr = NULL;

                } else {
                    /* Second field in complementary pair */
                    s0->first_field = 0;
                }
            }

        } else {
            /* Frame or first field in a potentially complementary pair */
            assert(!s0->current_picture_ptr);
            s0->first_field = FIELD_PICTURE;
        }

        if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
            s0->first_field = 0;
            return -1;
        }
    }
    if(h != h0)
        clone_slice(h, h0);

    s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup

    assert(s->mb_num == s->mb_width * s->mb_height);
    if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
       first_mb_in_slice                    >= s->mb_num){
        av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
        return -1;
    }
    s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
    s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
    if (s->picture_structure == PICT_BOTTOM_FIELD)
        s->resync_mb_y = s->mb_y = s->mb_y + 1;
    assert(s->mb_y < s->mb_height);

    /* Field pictures get picture numbers twice as dense (spec 8.2.4.1). */
    if(s->picture_structure==PICT_FRAME){
        h->curr_pic_num=   h->frame_num;
        h->max_pic_num= 1<< h->sps.log2_max_frame_num;
    }else{
        h->curr_pic_num= 2*h->frame_num + 1;
        h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
    }

    if(h->nal_unit_type == NAL_IDR_SLICE){
        get_ue_golomb(&s->gb); /* idr_pic_id */
    }

    if(h->sps.poc_type==0){
        h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);

        if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
            h->delta_poc_bottom= get_se_golomb(&s->gb);
        }
    }

    if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
        h->delta_poc[0]= get_se_golomb(&s->gb);

        if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
            h->delta_poc[1]= get_se_golomb(&s->gb);
    }

    init_poc(h);

    if(h->pps.redundant_pic_cnt_present){
        h->redundant_pic_count= get_ue_golomb(&s->gb);
    }

    //set defaults, might be overridden a few lines later
    h->ref_count[0]= h->pps.ref_count[0];
    h->ref_count[1]= h->pps.ref_count[1];

    if(h->slice_type_nos != FF_I_TYPE){
        if(h->slice_type_nos == FF_B_TYPE){
            h->direct_spatial_mv_pred= get_bits1(&s->gb);
        }
        num_ref_idx_active_override_flag= get_bits1(&s->gb);

        if(num_ref_idx_active_override_flag){
            h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
            if(h->slice_type_nos==FF_B_TYPE)
                h->ref_count[1]= get_ue_golomb(&s->gb) + 1;

            /* 32 = 16 frames * 2 fields, the maximum list size. */
            if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
                av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
                h->ref_count[0]= h->ref_count[1]= 1;
                return -1;
            }
        }
        if(h->slice_type_nos == FF_B_TYPE)
            h->list_count= 2;
        else
            h->list_count= 1;
    }else
        h->list_count= 0;

    if(!default_ref_list_done){
        fill_default_ref_list(h);
    }

    if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
        return -1;

    /* Mirror list heads into the MpegEncContext pictures for legacy code. */
    if(h->slice_type_nos!=FF_I_TYPE){
        s->last_picture_ptr= &h->ref_list[0][0];
        ff_copy_picture(&s->last_picture, s->last_picture_ptr);
    }
    if(h->slice_type_nos==FF_B_TYPE){
        s->next_picture_ptr= &h->ref_list[1][0];
        ff_copy_picture(&s->next_picture, s->next_picture_ptr);
    }

    if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
       ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
        pred_weight_table(h);
    else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
        implicit_weight_table(h);
    else
        h->use_weight = 0;

    if(h->nal_ref_idc)
        decode_ref_pic_marking(h0, &s->gb);

    if(FRAME_MBAFF)
        fill_mbaff_ref_list(h);

    if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
        direct_dist_scale_factor(h);
    direct_ref_list_init(h);

    if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
        tmp = get_ue_golomb(&s->gb);
        if(tmp > 2){
            av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
            return -1;
        }
        h->cabac_init_idc= tmp;
    }

    h->last_qscale_diff = 0;
    tmp = h->pps.init_qp + get_se_golomb(&s->gb);
    if(tmp>51){
        av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
        return -1;
    }
    s->qscale= tmp;
    h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
    h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
    //FIXME qscale / qp ... stuff
    if(h->slice_type == FF_SP_TYPE){
        get_bits1(&s->gb); /* sp_for_switch_flag */
    }
    if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
        get_se_golomb(&s->gb); /* slice_qs_delta */
    }

    h->deblocking_filter = 1;
    h->slice_alpha_c0_offset = 0;
    h->slice_beta_offset = 0;
    if( h->pps.deblocking_filter_parameters_present ) {
        tmp= get_ue_golomb(&s->gb);
        if(tmp > 2){
            av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
            return -1;
        }
        h->deblocking_filter= tmp;
        /* Bitstream idc 0 means "filter on", 1 means "off"; swap so that
         * h->deblocking_filter==1 means enabled. */
        if(h->deblocking_filter < 2)
            h->deblocking_filter^= 1; // 1<->0

        if( h->deblocking_filter ) {
            h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
            h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
        }
    }

    /* Honor the user's skip_loop_filter discard level. */
    if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
       ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
       ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
       ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
        h->deblocking_filter= 0;

    /* Type-1 deblocking crosses slice boundaries, which breaks slice-parallel
     * decoding: either cheat (FAST) or fall back to sequential decoding. */
    if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
        if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
            /* Cheat slightly for speed:
               Do not bother to deblock across slices. */
            h->deblocking_filter = 2;
        } else {
            h0->max_contexts = 1;
            if(!h0->single_decode_warning) {
                av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
                h0->single_decode_warning = 1;
            }
            if(h != h0)
                return 1; // deblocking switched inside frame
        }
    }

#if 0 //FMO
    if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
        slice_group_change_cycle= get_bits(&s->gb, ?);
#endif

    h0->last_slice_type = slice_type;
    h->slice_num = ++h0->current_slice;
    /* NOTE(review): overflow is only reported, not rejected; the ref2frm
     * index below masks with MAX_SLICES-1 so it cannot write out of bounds. */
    if(h->slice_num >= MAX_SLICES){
        av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
    }

    /* Precompute per-slice ref-index -> frame-number mapping (entries 0/1
     * and 18/19 are -1 sentinels for "no reference"). */
    for(j=0; j<2; j++){
        int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
        ref2frm[0]=
        ref2frm[1]= -1;
        for(i=0; i<16; i++)
            ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
                          +(h->ref_list[j][i].reference&3);
        ref2frm[18+0]=
        ref2frm[18+1]= -1;
        for(i=16; i<48; i++)
            ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
                          +(h->ref_list[j][i].reference&3);
    }

    h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
    h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;

    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
               h->slice_num,
               (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
               first_mb_in_slice,
               av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
               pps_id, h->frame_num,
               s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
               h->ref_count[0], h->ref_count[1],
               s->qscale,
               h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
               h->use_weight,
               h->use_weight==1 && h->use_weight_chroma ? "c" : "",
               h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
               );
    }

    return 0;
}
/**
 * Reads a CAVLC level_prefix: the number of leading zero bits before the
 * first 1 bit (unary code). Returns that count; consumes count+1 bits.
 */
static inline int get_level_prefix(GetBitContext *gb){
    unsigned int buf;
    int log;

    OPEN_READER(re, gb);
    UPDATE_CACHE(re, gb);
    buf=GET_CACHE(re, gb);

    /* Position of the first set bit from the MSB: leading zeros + 1. */
    log= 32 - av_log2(buf);
#ifdef TRACE
    print_bin(buf>>(32-log), log);
    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
#endif

    /* Skip the zeros and the terminating 1 bit. */
    LAST_SKIP_BITS(re, gb, log);
    CLOSE_READER(re, gb);

    return log-1;
}
4131 static inline int get_dct8x8_allowed(H264Context *h){
4132 int i;
4133 for(i=0; i<4; i++){
4134 if(!IS_SUB_8X8(h->sub_mb_type[i])
4135 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4136 return 0;
4138 return 1;
4142 * decodes a residual block.
4143 * @param n block index
4144 * @param scantable scantable
4145 * @param max_coeff number of coefficients in the block
4146 * @return <0 if an error occurred
static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
    MpegEncContext * const s = &h->s;
    /* Maps the predicted non-zero count (0..16) to one of the four
     * coeff_token VLC tables. */
    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    int level[16];
    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;

    //FIXME put trailing_onex into the context

    /* coeff_token encodes (total_coeff << 2) | trailing_ones; the table
     * choice depends on the predicted count from neighboring blocks. */
    if(n == CHROMA_DC_BLOCK_INDEX){
        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
        total_coeff= coeff_token>>2;
    }else{
        if(n == LUMA_DC_BLOCK_INDEX){
            total_coeff= pred_non_zero_count(h, 0);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
        }else{
            total_coeff= pred_non_zero_count(h, n);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
        }
    }

    //FIXME set last_non_zero?

    if(total_coeff==0)
        return 0;
    if(total_coeff > (unsigned)max_coeff) {
        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
        return -1;
    }

    trailing_ones= coeff_token&3;
    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
    assert(total_coeff<=16);

    /* Trailing ones are coded as single sign bits (+1 or -1). */
    for(i=0; i<trailing_ones; i++){
        level[i]= 1 - 2*get_bits1(gb);
    }

    if(i<total_coeff) {
        int level_code, mask;
        int suffix_length = total_coeff > 10 && trailing_ones < 3;
        int prefix= get_level_prefix(gb);

        //first coefficient has suffix_length equal to 0 or 1
        if(prefix<14){ //FIXME try to build a large unified VLC table for all this
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= (prefix<<suffix_length); //part
        }else if(prefix==14){
            if(suffix_length)
                level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
            else
                level_code= prefix + get_bits(gb, 4); //part
        }else{
            level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
            if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
            if(prefix>=16)
                level_code += (1<<(prefix-3))-4096;
        }

        /* With fewer than 3 trailing ones, levels |1| are already covered,
         * so the code space is shifted by 2. */
        if(trailing_ones < 3) level_code += 2;

        suffix_length = 1;
        if(level_code > 5)
            suffix_length++;
        /* Map the unsigned code to a signed level: even -> positive. */
        mask= -(level_code&1);
        level[i]= (((2+level_code)>>1) ^ mask) - mask;
        i++;

        //remaining coefficients have suffix_length > 0
        for(;i<total_coeff;i++) {
            /* Thresholds at which suffix_length grows (spec table 9-6). */
            static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
            prefix = get_level_prefix(gb);
            if(prefix<15){
                level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
            }else{
                level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
                if(prefix>=16)
                    level_code += (1<<(prefix-3))-4096;
            }
            mask= -(level_code&1);
            level[i]= (((2+level_code)>>1) ^ mask) - mask;
            if(level_code > suffix_limit[suffix_length])
                suffix_length++;
        }
    }

    /* total_zeros: how many zero coefficients precede the last non-zero one. */
    if(total_coeff == max_coeff)
        zeros_left=0;
    else{
        if(n == CHROMA_DC_BLOCK_INDEX)
            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
        else
            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

    /* Scatter levels back to front; run_before gives the zero-run between
     * consecutive coefficients. n > 24 are DC blocks, stored without
     * dequantization (qmul applied to the AC path below). */
    coeff_num = zeros_left + total_coeff - 1;
    j = scantable[coeff_num];
    if(n > 24){
        block[j] = level[0];
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= level[i];
        }
    }else{
        block[j] = (level[0] * qmul[j] + 32)>>6;
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= (level[i] * qmul[j] + 32)>>6;
        }
    }

    /* A negative remainder means the run_before values were inconsistent
     * with total_zeros — corrupted bitstream. */
    if(zeros_left<0){
        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    return 0;
}
4292 static void predict_field_decoding_flag(H264Context *h){
4293 MpegEncContext * const s = &h->s;
4294 const int mb_xy= h->mb_xy;
4295 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4296 ? s->current_picture.mb_type[mb_xy-1]
4297 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4298 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4299 : 0;
4300 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4304 * decodes a P_SKIP or B_SKIP macroblock
static void decode_mb_skip(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    int mb_type=0;

    /* Skipped macroblocks carry no residual: clear coefficient counts. */
    memset(h->non_zero_count[mb_xy], 0, 16);
    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui

    if(MB_FIELD)
        mb_type|= MB_TYPE_INTERLACED;

    if( h->slice_type_nos == FF_B_TYPE )
    {
        // just for fill_caches. pred_direct_motion will set the real mb_type
        mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;

        /* B_SKIP: motion is derived by direct-mode prediction. */
        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
        pred_direct_motion(h, &mb_type);
        mb_type|= MB_TYPE_SKIP;
    }
    else
    {
        int mx, my;
        /* P_SKIP: one 16x16 partition, ref index 0, median-predicted MV. */
        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;

        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
        pred_pskip_motion(h, &mx, &my);
        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
        fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
    }

    write_back_motion(h, mb_type);
    s->current_picture.mb_type[mb_xy]= mb_type;
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    h->slice_table[ mb_xy ]= h->slice_num;
    h->prev_mb_skipped= 1;
}
4345 * decodes a macroblock
4346 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4348 static int decode_mb_cavlc(H264Context *h){
4349 MpegEncContext * const s = &h->s;
4350 int mb_xy;
4351 int partition_count;
4352 unsigned int mb_type, cbp;
4353 int dct8x8_allowed= h->pps.transform_8x8_mode;
4355 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4357 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4359 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4360 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4361 down the code */
4362 if(h->slice_type_nos != FF_I_TYPE){
4363 if(s->mb_skip_run==-1)
4364 s->mb_skip_run= get_ue_golomb(&s->gb);
4366 if (s->mb_skip_run--) {
4367 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4368 if(s->mb_skip_run==0)
4369 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4370 else
4371 predict_field_decoding_flag(h);
4373 decode_mb_skip(h);
4374 return 0;
4377 if(FRAME_MBAFF){
4378 if( (s->mb_y&1) == 0 )
4379 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4382 h->prev_mb_skipped= 0;
4384 mb_type= get_ue_golomb(&s->gb);
4385 if(h->slice_type_nos == FF_B_TYPE){
4386 if(mb_type < 23){
4387 partition_count= b_mb_type_info[mb_type].partition_count;
4388 mb_type= b_mb_type_info[mb_type].type;
4389 }else{
4390 mb_type -= 23;
4391 goto decode_intra_mb;
4393 }else if(h->slice_type_nos == FF_P_TYPE){
4394 if(mb_type < 5){
4395 partition_count= p_mb_type_info[mb_type].partition_count;
4396 mb_type= p_mb_type_info[mb_type].type;
4397 }else{
4398 mb_type -= 5;
4399 goto decode_intra_mb;
4401 }else{
4402 assert(h->slice_type_nos == FF_I_TYPE);
4403 if(h->slice_type == FF_SI_TYPE && mb_type)
4404 mb_type--;
4405 decode_intra_mb:
4406 if(mb_type > 25){
4407 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4408 return -1;
4410 partition_count=0;
4411 cbp= i_mb_type_info[mb_type].cbp;
4412 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4413 mb_type= i_mb_type_info[mb_type].type;
4416 if(MB_FIELD)
4417 mb_type |= MB_TYPE_INTERLACED;
4419 h->slice_table[ mb_xy ]= h->slice_num;
4421 if(IS_INTRA_PCM(mb_type)){
4422 unsigned int x;
4424 // We assume these blocks are very rare so we do not optimize it.
4425 align_get_bits(&s->gb);
4427 // The pixels are stored in the same order as levels in h->mb array.
4428 for(x=0; x < (CHROMA ? 384 : 256); x++){
4429 ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4432 // In deblocking, the quantizer is 0
4433 s->current_picture.qscale_table[mb_xy]= 0;
4434 // All coeffs are present
4435 memset(h->non_zero_count[mb_xy], 16, 16);
4437 s->current_picture.mb_type[mb_xy]= mb_type;
4438 return 0;
4441 if(MB_MBAFF){
4442 h->ref_count[0] <<= 1;
4443 h->ref_count[1] <<= 1;
4446 fill_caches(h, mb_type, 0);
4448 //mb_pred
4449 if(IS_INTRA(mb_type)){
4450 int pred_mode;
4451 // init_top_left_availability(h);
4452 if(IS_INTRA4x4(mb_type)){
4453 int i;
4454 int di = 1;
4455 if(dct8x8_allowed && get_bits1(&s->gb)){
4456 mb_type |= MB_TYPE_8x8DCT;
4457 di = 4;
4460 // fill_intra4x4_pred_table(h);
4461 for(i=0; i<16; i+=di){
4462 int mode= pred_intra_mode(h, i);
4464 if(!get_bits1(&s->gb)){
4465 const int rem_mode= get_bits(&s->gb, 3);
4466 mode = rem_mode + (rem_mode >= mode);
4469 if(di==4)
4470 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4471 else
4472 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4474 write_back_intra_pred_mode(h);
4475 if( check_intra4x4_pred_mode(h) < 0)
4476 return -1;
4477 }else{
4478 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4479 if(h->intra16x16_pred_mode < 0)
4480 return -1;
4482 if(CHROMA){
4483 pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4484 if(pred_mode < 0)
4485 return -1;
4486 h->chroma_pred_mode= pred_mode;
4488 }else if(partition_count==4){
4489 int i, j, sub_partition_count[4], list, ref[2][4];
4491 if(h->slice_type_nos == FF_B_TYPE){
4492 for(i=0; i<4; i++){
4493 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4494 if(h->sub_mb_type[i] >=13){
4495 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4496 return -1;
4498 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4499 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4501 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4502 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4503 pred_direct_motion(h, &mb_type);
4504 h->ref_cache[0][scan8[4]] =
4505 h->ref_cache[1][scan8[4]] =
4506 h->ref_cache[0][scan8[12]] =
4507 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4509 }else{
4510 assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4511 for(i=0; i<4; i++){
4512 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4513 if(h->sub_mb_type[i] >=4){
4514 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4515 return -1;
4517 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4518 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4522 for(list=0; list<h->list_count; list++){
4523 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4524 for(i=0; i<4; i++){
4525 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4526 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4527 unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4528 if(tmp>=ref_count){
4529 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4530 return -1;
4532 ref[list][i]= tmp;
4533 }else{
4534 //FIXME
4535 ref[list][i] = -1;
4540 if(dct8x8_allowed)
4541 dct8x8_allowed = get_dct8x8_allowed(h);
4543 for(list=0; list<h->list_count; list++){
4544 for(i=0; i<4; i++){
4545 if(IS_DIRECT(h->sub_mb_type[i])) {
4546 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4547 continue;
4549 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4550 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4552 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4553 const int sub_mb_type= h->sub_mb_type[i];
4554 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4555 for(j=0; j<sub_partition_count[i]; j++){
4556 int mx, my;
4557 const int index= 4*i + block_width*j;
4558 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4559 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4560 mx += get_se_golomb(&s->gb);
4561 my += get_se_golomb(&s->gb);
4562 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4564 if(IS_SUB_8X8(sub_mb_type)){
4565 mv_cache[ 1 ][0]=
4566 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4567 mv_cache[ 1 ][1]=
4568 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4569 }else if(IS_SUB_8X4(sub_mb_type)){
4570 mv_cache[ 1 ][0]= mx;
4571 mv_cache[ 1 ][1]= my;
4572 }else if(IS_SUB_4X8(sub_mb_type)){
4573 mv_cache[ 8 ][0]= mx;
4574 mv_cache[ 8 ][1]= my;
4576 mv_cache[ 0 ][0]= mx;
4577 mv_cache[ 0 ][1]= my;
4579 }else{
4580 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4581 p[0] = p[1]=
4582 p[8] = p[9]= 0;
4586 }else if(IS_DIRECT(mb_type)){
4587 pred_direct_motion(h, &mb_type);
4588 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4589 }else{
4590 int list, mx, my, i;
4591 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4592 if(IS_16X16(mb_type)){
4593 for(list=0; list<h->list_count; list++){
4594 unsigned int val;
4595 if(IS_DIR(mb_type, 0, list)){
4596 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4597 if(val >= h->ref_count[list]){
4598 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4599 return -1;
4601 }else
4602 val= LIST_NOT_USED&0xFF;
4603 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4605 for(list=0; list<h->list_count; list++){
4606 unsigned int val;
4607 if(IS_DIR(mb_type, 0, list)){
4608 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4609 mx += get_se_golomb(&s->gb);
4610 my += get_se_golomb(&s->gb);
4611 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4613 val= pack16to32(mx,my);
4614 }else
4615 val=0;
4616 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4619 else if(IS_16X8(mb_type)){
4620 for(list=0; list<h->list_count; list++){
4621 for(i=0; i<2; i++){
4622 unsigned int val;
4623 if(IS_DIR(mb_type, i, list)){
4624 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4625 if(val >= h->ref_count[list]){
4626 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4627 return -1;
4629 }else
4630 val= LIST_NOT_USED&0xFF;
4631 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4634 for(list=0; list<h->list_count; list++){
4635 for(i=0; i<2; i++){
4636 unsigned int val;
4637 if(IS_DIR(mb_type, i, list)){
4638 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4639 mx += get_se_golomb(&s->gb);
4640 my += get_se_golomb(&s->gb);
4641 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4643 val= pack16to32(mx,my);
4644 }else
4645 val=0;
4646 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4649 }else{
4650 assert(IS_8X16(mb_type));
4651 for(list=0; list<h->list_count; list++){
4652 for(i=0; i<2; i++){
4653 unsigned int val;
4654 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4655 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4656 if(val >= h->ref_count[list]){
4657 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4658 return -1;
4660 }else
4661 val= LIST_NOT_USED&0xFF;
4662 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4665 for(list=0; list<h->list_count; list++){
4666 for(i=0; i<2; i++){
4667 unsigned int val;
4668 if(IS_DIR(mb_type, i, list)){
4669 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4670 mx += get_se_golomb(&s->gb);
4671 my += get_se_golomb(&s->gb);
4672 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4674 val= pack16to32(mx,my);
4675 }else
4676 val=0;
4677 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4683 if(IS_INTER(mb_type))
4684 write_back_motion(h, mb_type);
4686 if(!IS_INTRA16x16(mb_type)){
4687 cbp= get_ue_golomb(&s->gb);
4688 if(cbp > 47){
4689 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4690 return -1;
4693 if(CHROMA){
4694 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4695 else cbp= golomb_to_inter_cbp [cbp];
4696 }else{
4697 if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4698 else cbp= golomb_to_inter_cbp_gray[cbp];
4701 h->cbp = cbp;
4703 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4704 if(get_bits1(&s->gb)){
4705 mb_type |= MB_TYPE_8x8DCT;
4706 h->cbp_table[mb_xy]= cbp;
4709 s->current_picture.mb_type[mb_xy]= mb_type;
4711 if(cbp || IS_INTRA16x16(mb_type)){
4712 int i8x8, i4x4, chroma_idx;
4713 int dquant;
4714 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4715 const uint8_t *scan, *scan8x8, *dc_scan;
4717 // fill_non_zero_count_cache(h);
4719 if(IS_INTERLACED(mb_type)){
4720 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4721 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4722 dc_scan= luma_dc_field_scan;
4723 }else{
4724 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4725 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4726 dc_scan= luma_dc_zigzag_scan;
4729 dquant= get_se_golomb(&s->gb);
4731 if( dquant > 25 || dquant < -26 ){
4732 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4733 return -1;
4736 s->qscale += dquant;
4737 if(((unsigned)s->qscale) > 51){
4738 if(s->qscale<0) s->qscale+= 52;
4739 else s->qscale-= 52;
4742 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4743 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4744 if(IS_INTRA16x16(mb_type)){
4745 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4746 return -1; //FIXME continue if partitioned and other return -1 too
4749 assert((cbp&15) == 0 || (cbp&15) == 15);
4751 if(cbp&15){
4752 for(i8x8=0; i8x8<4; i8x8++){
4753 for(i4x4=0; i4x4<4; i4x4++){
4754 const int index= i4x4 + 4*i8x8;
4755 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4756 return -1;
4760 }else{
4761 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4763 }else{
4764 for(i8x8=0; i8x8<4; i8x8++){
4765 if(cbp & (1<<i8x8)){
4766 if(IS_8x8DCT(mb_type)){
4767 DCTELEM *buf = &h->mb[64*i8x8];
4768 uint8_t *nnz;
4769 for(i4x4=0; i4x4<4; i4x4++){
4770 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4771 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4772 return -1;
4774 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4775 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4776 }else{
4777 for(i4x4=0; i4x4<4; i4x4++){
4778 const int index= i4x4 + 4*i8x8;
4780 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4781 return -1;
4785 }else{
4786 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4787 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4792 if(cbp&0x30){
4793 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4794 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4795 return -1;
4799 if(cbp&0x20){
4800 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4801 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4802 for(i4x4=0; i4x4<4; i4x4++){
4803 const int index= 16 + 4*chroma_idx + i4x4;
4804 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4805 return -1;
4809 }else{
4810 uint8_t * const nnz= &h->non_zero_count_cache[0];
4811 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4812 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4814 }else{
4815 uint8_t * const nnz= &h->non_zero_count_cache[0];
4816 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4817 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4818 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4820 s->current_picture.qscale_table[mb_xy]= s->qscale;
4821 write_back_non_zero_count(h);
4823 if(MB_MBAFF){
4824 h->ref_count[0] >>= 1;
4825 h->ref_count[1] >>= 1;
4828 return 0;
4831 static int decode_cabac_field_decoding_flag(H264Context *h) {
4832 MpegEncContext * const s = &h->s;
4833 const int mb_x = s->mb_x;
4834 const int mb_y = s->mb_y & ~1;
4835 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4836 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4838 unsigned int ctx = 0;
4840 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4841 ctx += 1;
4843 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4844 ctx += 1;
4847 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4850 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4851 uint8_t *state= &h->cabac_state[ctx_base];
4852 int mb_type;
4854 if(intra_slice){
4855 MpegEncContext * const s = &h->s;
4856 const int mba_xy = h->left_mb_xy[0];
4857 const int mbb_xy = h->top_mb_xy;
4858 int ctx=0;
4859 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4860 ctx++;
4861 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4862 ctx++;
4863 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4864 return 0; /* I4x4 */
4865 state += 2;
4866 }else{
4867 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4868 return 0; /* I4x4 */
4871 if( get_cabac_terminate( &h->cabac ) )
4872 return 25; /* PCM */
4874 mb_type = 1; /* I16x16 */
4875 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4876 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4877 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4878 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4879 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4880 return mb_type;
4883 static int decode_cabac_mb_type( H264Context *h ) {
4884 MpegEncContext * const s = &h->s;
4886 if( h->slice_type_nos == FF_I_TYPE ) {
4887 return decode_cabac_intra_mb_type(h, 3, 1);
4888 } else if( h->slice_type_nos == FF_P_TYPE ) {
4889 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4890 /* P-type */
4891 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4892 /* P_L0_D16x16, P_8x8 */
4893 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4894 } else {
4895 /* P_L0_D8x16, P_L0_D16x8 */
4896 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4898 } else {
4899 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4901 } else if( h->slice_type_nos == FF_B_TYPE ) {
4902 const int mba_xy = h->left_mb_xy[0];
4903 const int mbb_xy = h->top_mb_xy;
4904 int ctx = 0;
4905 int bits;
4907 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4908 ctx++;
4909 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4910 ctx++;
4912 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4913 return 0; /* B_Direct_16x16 */
4915 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4916 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4919 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4920 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4921 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4922 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4923 if( bits < 8 )
4924 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4925 else if( bits == 13 ) {
4926 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4927 } else if( bits == 14 )
4928 return 11; /* B_L1_L0_8x16 */
4929 else if( bits == 15 )
4930 return 22; /* B_8x8 */
4932 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4933 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4934 } else {
4935 /* TODO SI/SP frames? */
4936 return -1;
/**
 * Decodes the mb_skip_flag for the macroblock at (mb_x, mb_y) with CABAC.
 *
 * The context (0..2, +13 in B slices) counts the left and top neighbours
 * that are in the same slice and are NOT skipped.  In MBAFF mode the
 * neighbour addresses must be derived from the MB-pair geometry because
 * the skip flag can be read before the field/frame flag of the current
 * pair is known.
 *
 * @return the decoded skip flag (0 or 1)
 */
static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
    MpegEncContext * const s = &h->s;
    int mba_xy, mbb_xy;
    int ctx = 0;

    if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
        /* address of the top MB of the current pair */
        int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
        mba_xy = mb_xy - 1;
        /* for the bottom MB of a pair, the left neighbour is the bottom MB
         * of the left pair when its field/frame coding matches ours */
        if( (mb_y&1)
            && h->slice_table[mba_xy] == h->slice_num
            && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
            mba_xy += s->mb_stride;
        if( MB_FIELD ){
            mbb_xy = mb_xy - s->mb_stride;
            /* top MB of a field pair: step up one more row when the pair
             * above is itself field-coded */
            if( !(mb_y&1)
                && h->slice_table[mbb_xy] == h->slice_num
                && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
                mbb_xy -= s->mb_stride;
        }else
            mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
    }else{
        int mb_xy = h->mb_xy;
        mba_xy = mb_xy - 1;
        /* in field pictures the vertical neighbour is two rows up */
        mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
    }

    /* non-skipped same-slice neighbours raise the context */
    if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
        ctx++;
    if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
        ctx++;

    if( h->slice_type_nos == FF_B_TYPE )
        ctx += 13;
    return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
}
4976 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4977 int mode = 0;
4979 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4980 return pred_mode;
4982 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4983 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4984 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4986 if( mode >= pred_mode )
4987 return mode + 1;
4988 else
4989 return mode;
4992 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4993 const int mba_xy = h->left_mb_xy[0];
4994 const int mbb_xy = h->top_mb_xy;
4996 int ctx = 0;
4998 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4999 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5000 ctx++;
5002 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5003 ctx++;
5005 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5006 return 0;
5008 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5009 return 1;
5010 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5011 return 2;
5012 else
5013 return 3;
5016 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5017 int cbp_b, cbp_a, ctx, cbp = 0;
5019 cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5020 cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? h->top_cbp : -1;
5022 ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5023 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5024 ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08);
5025 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5026 ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01);
5027 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5028 ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02);
5029 cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5030 return cbp;
5032 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5033 int ctx;
5034 int cbp_a, cbp_b;
5036 cbp_a = (h->left_cbp>>4)&0x03;
5037 cbp_b = (h-> top_cbp>>4)&0x03;
5039 ctx = 0;
5040 if( cbp_a > 0 ) ctx++;
5041 if( cbp_b > 0 ) ctx += 2;
5042 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5043 return 0;
5045 ctx = 4;
5046 if( cbp_a == 2 ) ctx++;
5047 if( cbp_b == 2 ) ctx += 2;
5048 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5050 static int decode_cabac_mb_dqp( H264Context *h) {
5051 int ctx = 0;
5052 int val = 0;
5054 if( h->last_qscale_diff != 0 )
5055 ctx++;
5057 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5058 if( ctx < 2 )
5059 ctx = 2;
5060 else
5061 ctx = 3;
5062 val++;
5063 if(val > 102) //prevent infinite loop
5064 return INT_MIN;
5067 if( val&0x01 )
5068 return (val + 1)/2;
5069 else
5070 return -(val + 1)/2;
5072 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5073 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5074 return 0; /* 8x8 */
5075 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5076 return 1; /* 8x4 */
5077 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5078 return 2; /* 4x8 */
5079 return 3; /* 4x4 */
5081 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5082 int type;
5083 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5084 return 0; /* B_Direct_8x8 */
5085 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5086 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5087 type = 3;
5088 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5089 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5090 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5091 type += 4;
5093 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5094 type += get_cabac( &h->cabac, &h->cabac_state[39] );
5095 return type;
/**
 * Decodes transform_size_8x8_flag with CABAC.
 * The context (0..2) is the number of neighbouring macroblocks already
 * using the 8x8 transform, precomputed in h->neighbor_transform_size.
 */
static inline int decode_cabac_mb_transform_size( H264Context *h ) {
    return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
}
5102 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5103 int refa = h->ref_cache[list][scan8[n] - 1];
5104 int refb = h->ref_cache[list][scan8[n] - 8];
5105 int ref = 0;
5106 int ctx = 0;
5108 if( h->slice_type_nos == FF_B_TYPE) {
5109 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5110 ctx++;
5111 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5112 ctx += 2;
5113 } else {
5114 if( refa > 0 )
5115 ctx++;
5116 if( refb > 0 )
5117 ctx += 2;
5120 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5121 ref++;
5122 if( ctx < 4 )
5123 ctx = 4;
5124 else
5125 ctx = 5;
5126 if(ref >= 32 /*h->ref_list[list]*/){
5127 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5128 return 0; //FIXME we should return -1 and check the return everywhere
5131 return ref;
/**
 * Decodes one motion vector difference component with CABAC.
 *
 * Binarization: a truncated unary prefix (up to 8 continuation bins, with
 * context transitions 3..6) followed, for magnitudes >= 9, by an
 * Exp-Golomb-style bypass suffix starting at k=3, then a bypass sign bin.
 *
 * @param list reference list (0 or 1)
 * @param n    block index (scan8 order)
 * @param l    component: 0 = x, 1 = y
 * @return the signed mvd component, or INT_MIN on a corrupt bitstream
 */
static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
    /* first-bin context from the magnitude of the neighbouring mvds */
    int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
               abs( h->mvd_cache[list][scan8[n] - 8][l] );
    int ctxbase = (l == 0) ? 40 : 47;   /* separate context sets for x and y */
    int ctx, mvd;

    if( amvd < 3 )
        ctx = 0;
    else if( amvd > 32 )
        ctx = 2;
    else
        ctx = 1;

    if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
        return 0;

    /* unary prefix: count continuation bins up to magnitude 9 */
    mvd= 1;
    ctx= 3;
    while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
        mvd++;
        if( ctx < 6 )
            ctx++;
    }

    if( mvd >= 9 ) {
        /* bypass-coded exponential suffix: read the exponent, then k
         * mantissa bits */
        int k = 3;
        while( get_cabac_bypass( &h->cabac ) ) {
            mvd += 1 << k;
            k++;
            if(k>24){
                av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
                return INT_MIN;
            }
        }
        while( k-- ) {
            if( get_cabac_bypass( &h->cabac ) )
                mvd += 1 << k;
        }
    }
    /* bypass sign bin; returns +mvd or -mvd */
    return get_cabac_bypass_sign( &h->cabac, -mvd );
}
/**
 * Computes the coded_block_flag context for one residual block.
 *
 * The context depends on whether the corresponding left (nza) and top (nzb)
 * neighbour blocks contain nonzero coefficients.  DC blocks probe the
 * neighbour cbp bits cached in left_cbp/top_cbp (bit 8 = luma DC,
 * bits 6+idx = chroma DC); AC/4x4 blocks probe the non_zero_count_cache.
 *
 * @param cat   block category (0 luma DC, 1/2 luma AC/4x4, 3/4 chroma DC/AC)
 * @param idx   block index within the category
 * @param is_dc nonzero for the DC categories
 * @return ctx + 4*cat, the offset into the coded_block_flag context models
 */
static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
    int nza, nzb;
    int ctx = 0;

    if( is_dc ) {
        if( cat == 0 ) {
            /* luma DC: bit 8 of the neighbour cbp */
            nza = h->left_cbp&0x100;
            nzb = h-> top_cbp&0x100;
        } else {
            /* chroma DC: bits 6/7 of the neighbour cbp, selected by idx */
            nza = (h->left_cbp>>(6+idx))&0x01;
            nzb = (h-> top_cbp>>(6+idx))&0x01;
        }
    } else {
        if( cat == 4 ) {
            /* chroma AC blocks live at scan8[16..] in the nnz cache */
            nza = h->non_zero_count_cache[scan8[16+idx] - 1];
            nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
        } else {
            assert(cat == 1 || cat == 2);
            nza = h->non_zero_count_cache[scan8[idx] - 1];
            nzb = h->non_zero_count_cache[scan8[idx] - 8];
        }
    }

    if( nza > 0 )
        ctx++;

    if( nzb > 0 )
        ctx += 2;

    return ctx + 4 * cat;
}
/* Context offsets for the last_significant_coeff_flag of 8x8 blocks:
 * the 63 coefficient positions are grouped into 9 contexts (0..8).
 * Declared with DECLARE_ASM_CONST so the x86 asm significance decoder can
 * reference it directly. */
DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
/**
 * Decodes one block of residual coefficients with CABAC.
 *
 * Three stages: coded_block_flag (skipped for 8x8 luma, which signals
 * presence via cbp), a significance map giving the positions of nonzero
 * coefficients, and the levels (magnitude + sign) decoded in reverse scan
 * order.  Decoded levels are dequantized with qmul and written to block[]
 * at positions given by scantable; the nnz caches / cbp_table are updated.
 *
 * is_dc is a compile-time constant in the non-CONFIG_SMALL build (see the
 * _dc/_nondc wrappers), letting the compiler strip the dead branches.
 */
static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
    /* context model offsets, indexed by [MB_FIELD][cat] */
    static const int significant_coeff_flag_offset[2][6] = {
      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
    };
    static const int last_coeff_flag_offset[2][6] = {
      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
    };
    static const int coeff_abs_level_m1_offset[6] = {
        227+0, 227+10, 227+20, 227+30, 227+39, 426
    };
    /* per-position significance contexts for 8x8 blocks, [field][pos] */
    static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
      { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
        4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
        7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
      { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
        6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
        9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
        9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
    };
    /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
     * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
     * map node ctx => cabac ctx for level=1 */
    static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
    /* map node ctx => cabac ctx for level>1 */
    static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
    static const uint8_t coeff_abs_level_transition[2][8] = {
    /* update node ctx after decoding a level=1 */
        { 1, 2, 3, 3, 4, 5, 6, 7 },
    /* update node ctx after decoding a level>1 */
        { 4, 4, 4, 4, 5, 6, 7, 7 }
    };

    int index[64];              /* scan positions of the significant coeffs */

    int av_unused last;
    int coeff_count = 0;
    int node_ctx = 0;

    uint8_t *significant_coeff_ctx_base;
    uint8_t *last_coeff_ctx_base;
    uint8_t *abs_level_m1_ctx_base;

/* On non-x86, work on a local copy of the CABAC state so the compiler can
 * keep it in registers; copied back before every return. */
#ifndef ARCH_X86
#define CABAC_ON_STACK
#endif
#ifdef CABAC_ON_STACK
#define CC &cc
    CABACContext cc;
    cc.range     = h->cabac.range;
    cc.low       = h->cabac.low;
    cc.bytestream= h->cabac.bytestream;
#else
#define CC &h->cabac
#endif


    /* cat: 0-> DC 16x16  n = 0
     *      1-> AC 16x16  n = luma4x4idx
     *      2-> Luma4x4   n = luma4x4idx
     *      3-> DC Chroma n = iCbCr
     *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
     *      5-> Luma8x8   n = 4 * luma8x8idx
     */

    /* read coded block flag */
    if( is_dc || cat != 5 ) {
        if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
            /* no coefficients: clear the nnz cache entry and bail out */
            if( !is_dc ) {
                if( cat == 4 )
                    h->non_zero_count_cache[scan8[16+n]] = 0;
                else
                    h->non_zero_count_cache[scan8[n]] = 0;
            }

#ifdef CABAC_ON_STACK
            h->cabac.range     = cc.range     ;
            h->cabac.low       = cc.low       ;
            h->cabac.bytestream= cc.bytestream;
#endif
            return;
        }
    }

    significant_coeff_ctx_base = h->cabac_state
        + significant_coeff_flag_offset[MB_FIELD][cat];
    last_coeff_ctx_base = h->cabac_state
        + last_coeff_flag_offset[MB_FIELD][cat];
    abs_level_m1_ctx_base = h->cabac_state
        + coeff_abs_level_m1_offset[cat];

    /* significance map: either the hand-written x86 decoders or the generic
     * DECODE_SIGNIFICANCE macro; the `} else {` lines pair up differently in
     * the two preprocessed variants. */
    if( !is_dc && cat == 5 ) {
#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
        for(last= 0; last < coefs; last++) { \
            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
            if( get_cabac( CC, sig_ctx )) { \
                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
                index[coeff_count++] = last; \
                if( get_cabac( CC, last_ctx ) ) { \
                    last= max_coeff; \
                    break; \
                } \
            } \
        } \
        if( last == max_coeff -1 ) {\
            index[coeff_count++] = last;\
        }
        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
#if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
    } else {
        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
#else
        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
    } else {
        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
#endif
    }
    assert(coeff_count > 0);

    /* record which blocks are coded: cbp_table bits for DC, nnz cache for AC */
    if( is_dc ) {
        if( cat == 0 )
            h->cbp_table[h->mb_xy] |= 0x100;
        else
            h->cbp_table[h->mb_xy] |= 0x40 << n;
    } else {
        if( cat == 5 )
            fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
        else if( cat == 4 )
            h->non_zero_count_cache[scan8[16+n]] = coeff_count;
        else {
            assert( cat == 1 || cat == 2 );
            h->non_zero_count_cache[scan8[n]] = coeff_count;
        }
    }

    /* levels, decoded from the last significant coefficient backwards */
    do {
        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;

        int j= scantable[index[--coeff_count]];

        if( get_cabac( CC, ctx ) == 0 ) {
            /* |level| == 1: only the sign remains */
            node_ctx = coeff_abs_level_transition[0][node_ctx];
            if( is_dc ) {
                block[j] = get_cabac_bypass_sign( CC, -1);
            }else{
                block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
            }
        } else {
            int coeff_abs = 2;
            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
            node_ctx = coeff_abs_level_transition[1][node_ctx];

            /* truncated unary magnitude up to 14 */
            while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
                coeff_abs++;
            }

            if( coeff_abs >= 15 ) {
                /* bypass-coded Exp-Golomb escape; note this j intentionally
                 * shadows the scan index above as a bit counter */
                int j = 0;
                while( get_cabac_bypass( CC ) ) {
                    j++;
                }

                coeff_abs=1;
                while( j-- ) {
                    coeff_abs += coeff_abs + get_cabac_bypass( CC );
                }
                coeff_abs+= 14;
            }

            if( is_dc ) {
                block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
            }else{
                block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
            }
        }
    } while( coeff_count );
#ifdef CABAC_ON_STACK
            h->cabac.range     = cc.range     ;
            h->cabac.low       = cc.low       ;
            h->cabac.bytestream= cc.bytestream;
#endif

}
#ifndef CONFIG_SMALL
/* Large builds instantiate two specializations of
 * decode_cabac_residual_internal so the compiler resolves the is_dc
 * branches at compile time. */
static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
}

static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
}
#endif
/* Entry point for residual decoding: dispatches to the DC or non-DC
 * specialization (categories 0 and 3 are the DC blocks).  CONFIG_SMALL
 * builds call the shared internal directly to keep code size down. */
static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
#ifdef CONFIG_SMALL
    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
#else
    if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
    else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
#endif
}
/**
 * Computes the addresses of the top and left neighbour macroblocks
 * (h->top_mb_xy, h->left_mb_xy[0]) for the current MB.
 *
 * In MBAFF mode the neighbours depend on the frame/field coding of the
 * current and adjacent MB pairs; in plain field pictures the vertical
 * neighbour is one extra row up.
 */
static inline void compute_mb_neighbors(H264Context *h)
{
    MpegEncContext * const s = &h->s;
    const int mb_xy = h->mb_xy;
    h->top_mb_xy     = mb_xy - s->mb_stride;
    h->left_mb_xy[0] = mb_xy - 1;
    if(FRAME_MBAFF){
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_frame_flag = !MB_FIELD;
        const int bottom = (s->mb_y & 1);
        /* step one more row up when the vertical neighbour is the top MB
         * of its pair rather than the bottom one */
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
           ) {
            h->top_mb_xy -= s->mb_stride;
        }
        /* when left pair and current pair differ in frame/field coding,
         * the left neighbour is the top MB of the left pair */
        if (left_mb_frame_flag != curr_mb_frame_flag) {
            h->left_mb_xy[0] = pair_xy - 1;
        }
    } else if (FIELD_PICTURE) {
        h->top_mb_xy -= s->mb_stride;
    }
    return;
}
5450 * decodes a macroblock
5451 * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5453 static int decode_mb_cabac(H264Context *h) {
5454 MpegEncContext * const s = &h->s;
5455 int mb_xy;
5456 int mb_type, partition_count, cbp = 0;
5457 int dct8x8_allowed= h->pps.transform_8x8_mode;
5459 mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5461 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5463 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5464 if( h->slice_type_nos != FF_I_TYPE ) {
5465 int skip;
5466 /* a skipped mb needs the aff flag from the following mb */
5467 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5468 predict_field_decoding_flag(h);
5469 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5470 skip = h->next_mb_skipped;
5471 else
5472 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5473 /* read skip flags */
5474 if( skip ) {
5475 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5476 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5477 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5478 if(h->next_mb_skipped)
5479 predict_field_decoding_flag(h);
5480 else
5481 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5484 decode_mb_skip(h);
5486 h->cbp_table[mb_xy] = 0;
5487 h->chroma_pred_mode_table[mb_xy] = 0;
5488 h->last_qscale_diff = 0;
5490 return 0;
5494 if(FRAME_MBAFF){
5495 if( (s->mb_y&1) == 0 )
5496 h->mb_mbaff =
5497 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5500 h->prev_mb_skipped = 0;
5502 compute_mb_neighbors(h);
5503 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5504 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5505 return -1;
5508 if( h->slice_type_nos == FF_B_TYPE ) {
5509 if( mb_type < 23 ){
5510 partition_count= b_mb_type_info[mb_type].partition_count;
5511 mb_type= b_mb_type_info[mb_type].type;
5512 }else{
5513 mb_type -= 23;
5514 goto decode_intra_mb;
5516 } else if( h->slice_type_nos == FF_P_TYPE ) {
5517 if( mb_type < 5) {
5518 partition_count= p_mb_type_info[mb_type].partition_count;
5519 mb_type= p_mb_type_info[mb_type].type;
5520 } else {
5521 mb_type -= 5;
5522 goto decode_intra_mb;
5524 } else {
5525 if(h->slice_type == FF_SI_TYPE && mb_type)
5526 mb_type--;
5527 assert(h->slice_type_nos == FF_I_TYPE);
5528 decode_intra_mb:
5529 partition_count = 0;
5530 cbp= i_mb_type_info[mb_type].cbp;
5531 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5532 mb_type= i_mb_type_info[mb_type].type;
5534 if(MB_FIELD)
5535 mb_type |= MB_TYPE_INTERLACED;
5537 h->slice_table[ mb_xy ]= h->slice_num;
5539 if(IS_INTRA_PCM(mb_type)) {
5540 const uint8_t *ptr;
5542 // We assume these blocks are very rare so we do not optimize it.
5543 // FIXME The two following lines get the bitstream position in the cabac
5544 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5545 ptr= h->cabac.bytestream;
5546 if(h->cabac.low&0x1) ptr--;
5547 if(CABAC_BITS==16){
5548 if(h->cabac.low&0x1FF) ptr--;
5551 // The pixels are stored in the same order as levels in h->mb array.
5552 memcpy(h->mb, ptr, 256); ptr+=256;
5553 if(CHROMA){
5554 memcpy(h->mb+128, ptr, 128); ptr+=128;
5557 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5559 // All blocks are present
5560 h->cbp_table[mb_xy] = 0x1ef;
5561 h->chroma_pred_mode_table[mb_xy] = 0;
5562 // In deblocking, the quantizer is 0
5563 s->current_picture.qscale_table[mb_xy]= 0;
5564 // All coeffs are present
5565 memset(h->non_zero_count[mb_xy], 16, 16);
5566 s->current_picture.mb_type[mb_xy]= mb_type;
5567 h->last_qscale_diff = 0;
5568 return 0;
5571 if(MB_MBAFF){
5572 h->ref_count[0] <<= 1;
5573 h->ref_count[1] <<= 1;
5576 fill_caches(h, mb_type, 0);
5578 if( IS_INTRA( mb_type ) ) {
5579 int i, pred_mode;
5580 if( IS_INTRA4x4( mb_type ) ) {
5581 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5582 mb_type |= MB_TYPE_8x8DCT;
5583 for( i = 0; i < 16; i+=4 ) {
5584 int pred = pred_intra_mode( h, i );
5585 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5586 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5588 } else {
5589 for( i = 0; i < 16; i++ ) {
5590 int pred = pred_intra_mode( h, i );
5591 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5593 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5596 write_back_intra_pred_mode(h);
5597 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5598 } else {
5599 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5600 if( h->intra16x16_pred_mode < 0 ) return -1;
5602 if(CHROMA){
5603 h->chroma_pred_mode_table[mb_xy] =
5604 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5606 pred_mode= check_intra_pred_mode( h, pred_mode );
5607 if( pred_mode < 0 ) return -1;
5608 h->chroma_pred_mode= pred_mode;
5610 } else if( partition_count == 4 ) {
5611 int i, j, sub_partition_count[4], list, ref[2][4];
5613 if( h->slice_type_nos == FF_B_TYPE ) {
5614 for( i = 0; i < 4; i++ ) {
5615 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5616 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5617 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5619 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5620 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5621 pred_direct_motion(h, &mb_type);
5622 h->ref_cache[0][scan8[4]] =
5623 h->ref_cache[1][scan8[4]] =
5624 h->ref_cache[0][scan8[12]] =
5625 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5626 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5627 for( i = 0; i < 4; i++ )
5628 if( IS_DIRECT(h->sub_mb_type[i]) )
5629 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5632 } else {
5633 for( i = 0; i < 4; i++ ) {
5634 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5635 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5636 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5640 for( list = 0; list < h->list_count; list++ ) {
5641 for( i = 0; i < 4; i++ ) {
5642 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5643 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5644 if( h->ref_count[list] > 1 )
5645 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5646 else
5647 ref[list][i] = 0;
5648 } else {
5649 ref[list][i] = -1;
5651 h->ref_cache[list][ scan8[4*i]+1 ]=
5652 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5656 if(dct8x8_allowed)
5657 dct8x8_allowed = get_dct8x8_allowed(h);
5659 for(list=0; list<h->list_count; list++){
5660 for(i=0; i<4; i++){
5661 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5662 if(IS_DIRECT(h->sub_mb_type[i])){
5663 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5664 continue;
5667 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5668 const int sub_mb_type= h->sub_mb_type[i];
5669 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5670 for(j=0; j<sub_partition_count[i]; j++){
5671 int mpx, mpy;
5672 int mx, my;
5673 const int index= 4*i + block_width*j;
5674 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5675 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5676 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5678 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5679 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5680 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5682 if(IS_SUB_8X8(sub_mb_type)){
5683 mv_cache[ 1 ][0]=
5684 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5685 mv_cache[ 1 ][1]=
5686 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5688 mvd_cache[ 1 ][0]=
5689 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5690 mvd_cache[ 1 ][1]=
5691 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5692 }else if(IS_SUB_8X4(sub_mb_type)){
5693 mv_cache[ 1 ][0]= mx;
5694 mv_cache[ 1 ][1]= my;
5696 mvd_cache[ 1 ][0]= mx - mpx;
5697 mvd_cache[ 1 ][1]= my - mpy;
5698 }else if(IS_SUB_4X8(sub_mb_type)){
5699 mv_cache[ 8 ][0]= mx;
5700 mv_cache[ 8 ][1]= my;
5702 mvd_cache[ 8 ][0]= mx - mpx;
5703 mvd_cache[ 8 ][1]= my - mpy;
5705 mv_cache[ 0 ][0]= mx;
5706 mv_cache[ 0 ][1]= my;
5708 mvd_cache[ 0 ][0]= mx - mpx;
5709 mvd_cache[ 0 ][1]= my - mpy;
5711 }else{
5712 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5713 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5714 p[0] = p[1] = p[8] = p[9] = 0;
5715 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5719 } else if( IS_DIRECT(mb_type) ) {
5720 pred_direct_motion(h, &mb_type);
5721 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5722 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5723 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5724 } else {
5725 int list, mx, my, i, mpx, mpy;
5726 if(IS_16X16(mb_type)){
5727 for(list=0; list<h->list_count; list++){
5728 if(IS_DIR(mb_type, 0, list)){
5729 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5730 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5731 }else
5732 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5734 for(list=0; list<h->list_count; list++){
5735 if(IS_DIR(mb_type, 0, list)){
5736 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5738 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5739 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5740 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5742 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5743 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5744 }else
5745 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5748 else if(IS_16X8(mb_type)){
5749 for(list=0; list<h->list_count; list++){
5750 for(i=0; i<2; i++){
5751 if(IS_DIR(mb_type, i, list)){
5752 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5753 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5754 }else
5755 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5758 for(list=0; list<h->list_count; list++){
5759 for(i=0; i<2; i++){
5760 if(IS_DIR(mb_type, i, list)){
5761 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5762 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5763 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5764 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5766 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5767 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5768 }else{
5769 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5770 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5774 }else{
5775 assert(IS_8X16(mb_type));
5776 for(list=0; list<h->list_count; list++){
5777 for(i=0; i<2; i++){
5778 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5779 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5780 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5781 }else
5782 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5785 for(list=0; list<h->list_count; list++){
5786 for(i=0; i<2; i++){
5787 if(IS_DIR(mb_type, i, list)){
5788 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5789 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5790 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5792 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5793 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5794 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5795 }else{
5796 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5797 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5804 if( IS_INTER( mb_type ) ) {
5805 h->chroma_pred_mode_table[mb_xy] = 0;
5806 write_back_motion( h, mb_type );
5809 if( !IS_INTRA16x16( mb_type ) ) {
5810 cbp = decode_cabac_mb_cbp_luma( h );
5811 if(CHROMA)
5812 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5815 h->cbp_table[mb_xy] = h->cbp = cbp;
5817 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5818 if( decode_cabac_mb_transform_size( h ) )
5819 mb_type |= MB_TYPE_8x8DCT;
5821 s->current_picture.mb_type[mb_xy]= mb_type;
5823 if( cbp || IS_INTRA16x16( mb_type ) ) {
5824 const uint8_t *scan, *scan8x8, *dc_scan;
5825 const uint32_t *qmul;
5826 int dqp;
5828 if(IS_INTERLACED(mb_type)){
5829 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5830 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5831 dc_scan= luma_dc_field_scan;
5832 }else{
5833 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5834 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5835 dc_scan= luma_dc_zigzag_scan;
5838 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5839 if( dqp == INT_MIN ){
5840 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5841 return -1;
5843 s->qscale += dqp;
5844 if(((unsigned)s->qscale) > 51){
5845 if(s->qscale<0) s->qscale+= 52;
5846 else s->qscale-= 52;
5848 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5849 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5851 if( IS_INTRA16x16( mb_type ) ) {
5852 int i;
5853 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5854 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5856 if( cbp&15 ) {
5857 qmul = h->dequant4_coeff[0][s->qscale];
5858 for( i = 0; i < 16; i++ ) {
5859 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5860 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5862 } else {
5863 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5865 } else {
5866 int i8x8, i4x4;
5867 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5868 if( cbp & (1<<i8x8) ) {
5869 if( IS_8x8DCT(mb_type) ) {
5870 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5871 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5872 } else {
5873 qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5874 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5875 const int index = 4*i8x8 + i4x4;
5876 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5877 //START_TIMER
5878 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5879 //STOP_TIMER("decode_residual")
5882 } else {
5883 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5884 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5889 if( cbp&0x30 ){
5890 int c;
5891 for( c = 0; c < 2; c++ ) {
5892 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5893 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5897 if( cbp&0x20 ) {
5898 int c, i;
5899 for( c = 0; c < 2; c++ ) {
5900 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5901 for( i = 0; i < 4; i++ ) {
5902 const int index = 16 + 4 * c + i;
5903 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5904 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5907 } else {
5908 uint8_t * const nnz= &h->non_zero_count_cache[0];
5909 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5910 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5912 } else {
5913 uint8_t * const nnz= &h->non_zero_count_cache[0];
5914 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5915 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5916 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5917 h->last_qscale_diff = 0;
5920 s->current_picture.qscale_table[mb_xy]= s->qscale;
5921 write_back_non_zero_count(h);
5923 if(MB_MBAFF){
5924 h->ref_count[0] >>= 1;
5925 h->ref_count[1] >>= 1;
5928 return 0;
5932 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5933 int i, d;
5934 const int index_a = qp + h->slice_alpha_c0_offset;
5935 const int alpha = (alpha_table+52)[index_a];
5936 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5938 if( bS[0] < 4 ) {
5939 int8_t tc[4];
5940 for(i=0; i<4; i++)
5941 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5942 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5943 } else {
5944 /* 16px edge length, because bS=4 is triggered by being at
5945 * the edge of an intra MB, so all 4 bS are the same */
5946 for( d = 0; d < 16; d++ ) {
5947 const int p0 = pix[-1];
5948 const int p1 = pix[-2];
5949 const int p2 = pix[-3];
5951 const int q0 = pix[0];
5952 const int q1 = pix[1];
5953 const int q2 = pix[2];
5955 if( FFABS( p0 - q0 ) < alpha &&
5956 FFABS( p1 - p0 ) < beta &&
5957 FFABS( q1 - q0 ) < beta ) {
5959 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5960 if( FFABS( p2 - p0 ) < beta)
5962 const int p3 = pix[-4];
5963 /* p0', p1', p2' */
5964 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5965 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5966 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5967 } else {
5968 /* p0' */
5969 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5971 if( FFABS( q2 - q0 ) < beta)
5973 const int q3 = pix[3];
5974 /* q0', q1', q2' */
5975 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5976 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5977 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5978 } else {
5979 /* q0' */
5980 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5982 }else{
5983 /* p0', q0' */
5984 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5985 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5987 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5989 pix += stride;
5993 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5994 int i;
5995 const int index_a = qp + h->slice_alpha_c0_offset;
5996 const int alpha = (alpha_table+52)[index_a];
5997 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5999 if( bS[0] < 4 ) {
6000 int8_t tc[4];
6001 for(i=0; i<4; i++)
6002 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6003 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6004 } else {
6005 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Deblock the left (vertical) luma edge in MBAFF mode.  The 16 rows of the
 * edge may border either macroblock of the left pair, so there are 8
 * boundary strengths and 2 qp values instead of the usual 4 and 1; each row
 * selects its own bS/qp before applying the standard H.264 normal (bS<4) or
 * strong (bS==4) filter. */
6009 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6010 int i;
6011 for( i = 0; i < 16; i++, pix += stride) {
6012 int index_a;
6013 int alpha;
6014 int beta;
6016 int qp_index;
/* Pick the bS entry for this row: pairs of rows share one entry; in frame
 * (non-field) mode the low bit of i selects between the two field entries.
 * NOTE(review): the exact packing of bS[8] is defined by the caller
 * (filter_mb) -- confirm against it when changing this mapping. */
6017 int bS_index = (i >> 1);
6018 if (!MB_FIELD) {
6019 bS_index &= ~1;
6020 bS_index |= (i & 1);
/* bS == 0: no filtering for this row. */
6023 if( bS[bS_index] == 0 ) {
6024 continue;
/* Choose which of the two neighbour qps applies: field MBs split the edge
 * into halves (rows 0-7 / 8-15), frame MBs alternate row by row. */
6027 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6028 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6029 alpha = (alpha_table+52)[index_a];
6030 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
/* Normal filtering (bS 1..3): delta clipped to +-tc. */
6032 if( bS[bS_index] < 4 ) {
6033 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6034 const int p0 = pix[-1];
6035 const int p1 = pix[-2];
6036 const int p2 = pix[-3];
6037 const int q0 = pix[0];
6038 const int q1 = pix[1];
6039 const int q2 = pix[2];
/* Only filter where the edge looks like a blocking artifact. */
6041 if( FFABS( p0 - q0 ) < alpha &&
6042 FFABS( p1 - p0 ) < beta &&
6043 FFABS( q1 - q0 ) < beta ) {
6044 int tc = tc0;
6045 int i_delta;
/* Each p1'/q1' adjustment widens the p0/q0 clip range by one. */
6047 if( FFABS( p2 - p0 ) < beta ) {
6048 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6049 tc++;
6051 if( FFABS( q2 - q0 ) < beta ) {
6052 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6053 tc++;
6056 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6057 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6058 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6059 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* Strong filtering (bS == 4, intra edge). */
6061 }else{
6062 const int p0 = pix[-1];
6063 const int p1 = pix[-2];
6064 const int p2 = pix[-3];
6066 const int q0 = pix[0];
6067 const int q1 = pix[1];
6068 const int q2 = pix[2];
6070 if( FFABS( p0 - q0 ) < alpha &&
6071 FFABS( p1 - p0 ) < beta &&
6072 FFABS( q1 - q0 ) < beta ) {
/* Extra closeness test selects between the 3-tap and strong 5-tap filters. */
6074 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6075 if( FFABS( p2 - p0 ) < beta)
6077 const int p3 = pix[-4];
6078 /* p0', p1', p2' */
6079 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6080 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6081 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6082 } else {
6083 /* p0' */
6084 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6086 if( FFABS( q2 - q0 ) < beta)
6088 const int q3 = pix[3];
6089 /* q0', q1', q2' */
6090 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6091 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6092 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6093 } else {
6094 /* q0' */
6095 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6097 }else{
6098 /* p0', q0' */
6099 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6100 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6102 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6107 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6108 int i;
6109 for( i = 0; i < 8; i++, pix += stride) {
6110 int index_a;
6111 int alpha;
6112 int beta;
6114 int qp_index;
6115 int bS_index = i;
6117 if( bS[bS_index] == 0 ) {
6118 continue;
6121 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6122 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6123 alpha = (alpha_table+52)[index_a];
6124 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6126 if( bS[bS_index] < 4 ) {
6127 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6128 const int p0 = pix[-1];
6129 const int p1 = pix[-2];
6130 const int q0 = pix[0];
6131 const int q1 = pix[1];
6133 if( FFABS( p0 - q0 ) < alpha &&
6134 FFABS( p1 - p0 ) < beta &&
6135 FFABS( q1 - q0 ) < beta ) {
6136 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6138 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
6139 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
6140 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6142 }else{
6143 const int p0 = pix[-1];
6144 const int p1 = pix[-2];
6145 const int q0 = pix[0];
6146 const int q1 = pix[1];
6148 if( FFABS( p0 - q0 ) < alpha &&
6149 FFABS( p1 - p0 ) < beta &&
6150 FFABS( q1 - q0 ) < beta ) {
6152 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
6153 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
6154 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6160 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6161 int i, d;
6162 const int index_a = qp + h->slice_alpha_c0_offset;
6163 const int alpha = (alpha_table+52)[index_a];
6164 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6165 const int pix_next = stride;
6167 if( bS[0] < 4 ) {
6168 int8_t tc[4];
6169 for(i=0; i<4; i++)
6170 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6171 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6172 } else {
6173 /* 16px edge length, see filter_mb_edgev */
6174 for( d = 0; d < 16; d++ ) {
6175 const int p0 = pix[-1*pix_next];
6176 const int p1 = pix[-2*pix_next];
6177 const int p2 = pix[-3*pix_next];
6178 const int q0 = pix[0];
6179 const int q1 = pix[1*pix_next];
6180 const int q2 = pix[2*pix_next];
6182 if( FFABS( p0 - q0 ) < alpha &&
6183 FFABS( p1 - p0 ) < beta &&
6184 FFABS( q1 - q0 ) < beta ) {
6186 const int p3 = pix[-4*pix_next];
6187 const int q3 = pix[ 3*pix_next];
6189 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6190 if( FFABS( p2 - p0 ) < beta) {
6191 /* p0', p1', p2' */
6192 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6193 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6194 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6195 } else {
6196 /* p0' */
6197 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6199 if( FFABS( q2 - q0 ) < beta) {
6200 /* q0', q1', q2' */
6201 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6202 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6203 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6204 } else {
6205 /* q0' */
6206 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6208 }else{
6209 /* p0', q0' */
6210 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6211 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6213 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6215 pix++;
6220 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6221 int i;
6222 const int index_a = qp + h->slice_alpha_c0_offset;
6223 const int alpha = (alpha_table+52)[index_a];
6224 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
6226 if( bS[0] < 4 ) {
6227 int8_t tc[4];
6228 for(i=0; i<4; i++)
6229 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6230 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6231 } else {
6232 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Fast-path deblocking for one macroblock: for simple frame MBs it computes
 * the boundary strengths with the DSP helper and applies the edge filters
 * directly, falling back to the generic filter_mb() for the cases it cannot
 * handle. */
6236 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6237 MpegEncContext * const s = &h->s;
/* In a bottom-field picture the first MB row is row 1, not row 0. */
6238 int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6239 int mb_xy, mb_type;
6240 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6242 mb_xy = h->mb_xy;
/* Fall back to the generic filter_mb() for picture-border MBs, missing DSP
 * helper, per-plane chroma qp offsets, or cross-slice edges when
 * deblocking_filter==2.
 * NOTE(review): the bare "1 ||" below makes this condition always true, so
 * the fast path further down is currently never taken -- presumably a
 * temporary disable; confirm intent before relying on the code below. */
6244 if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6245 1 ||
6246 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6247 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6248 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6249 return;
6251 assert(!FRAME_MBAFF);
6253 mb_type = s->current_picture.mb_type[mb_xy];
/* Gather this MB's qp and the left/top neighbours', then average across each
 * shared edge as the loop filter requires. */
6254 qp = s->current_picture.qscale_table[mb_xy];
6255 qp0 = s->current_picture.qscale_table[mb_xy-1];
6256 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6257 qpc = get_chroma_qp( h, 0, qp );
6258 qpc0 = get_chroma_qp( h, 0, qp0 );
6259 qpc1 = get_chroma_qp( h, 0, qp1 );
6260 qp0 = (qp + qp0 + 1) >> 1;
6261 qp1 = (qp + qp1 + 1) >> 1;
6262 qpc0 = (qpc + qpc0 + 1) >> 1;
6263 qpc1 = (qpc + qpc1 + 1) >> 1;
/* Below this qp the filter cannot change any pixel, so skip the MB early. */
6264 qp_thresh = 15 - h->slice_alpha_c0_offset;
6265 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6266 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6267 return;
/* Intra MB: boundary strengths are constant (4 on outer edges, 3 inside;
 * the top MB edge drops to 3 in field pictures). */
6269 if( IS_INTRA(mb_type) ) {
6270 int16_t bS4[4] = {4,4,4,4};
6271 int16_t bS3[4] = {3,3,3,3};
6272 int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
/* With the 8x8 transform only luma edges 0 and 2 exist. */
6273 if( IS_8x8DCT(mb_type) ) {
6274 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6275 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6276 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6277 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6278 } else {
6279 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6280 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6281 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6282 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6283 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6284 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6285 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6286 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
/* Chroma always has only edges 0 and 2. */
6288 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6289 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6290 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6291 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6292 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6293 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6294 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6295 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6296 return;
6297 } else {
/* Inter MB: compute bS per edge with the DSP helper; bSv views the
 * int16_t bS[2][4][4] rows as packed 64-bit words for fast tests/stores. */
6298 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6299 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6300 int edges;
6301 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6302 edges = 4;
6303 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6304 } else {
/* mask_edge*: how often the mv-based bS must be rechecked, derived from
 * the partition shapes of this MB and the left neighbour. */
6305 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6306 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6307 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6308 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6309 ? 3 : 0;
6310 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6311 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6312 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6313 (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
/* Edges shared with an intra neighbour are forced to bS 4 (3 for the
 * horizontal edge in field pictures). */
6315 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6316 bSv[0][0] = 0x0004000400040004ULL;
6317 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6318 bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
/* FILTER(hv,dir,edge): apply one luma edge filter (dir 0 = vertical,
 * 1 = horizontal) and, for even edges, the co-located chroma edges. */
6320 #define FILTER(hv,dir,edge)\
6321 if(bSv[dir][edge]) {\
6322 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6323 if(!(edge&1)) {\
6324 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6325 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6328 if( edges == 1 ) {
6329 FILTER(v,0,0);
6330 FILTER(h,1,0);
6331 } else if( IS_8x8DCT(mb_type) ) {
6332 FILTER(v,0,0);
6333 FILTER(v,0,2);
6334 FILTER(h,1,0);
6335 FILTER(h,1,2);
6336 } else {
6337 FILTER(v,0,0);
6338 FILTER(v,0,1);
6339 FILTER(v,0,2);
6340 FILTER(v,0,3);
6341 FILTER(h,1,0);
6342 FILTER(h,1,1);
6343 FILTER(h,1,2);
6344 FILTER(h,1,3);
6346 #undef FILTER
6350 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6351 MpegEncContext * const s = &h->s;
6352 const int mb_xy= mb_x + mb_y*s->mb_stride;
6353 const int mb_type = s->current_picture.mb_type[mb_xy];
6354 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6355 int first_vertical_edge_done = 0;
6356 int dir;
6358 //for sufficiently low qp, filtering wouldn't do anything
6359 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6360 if(!FRAME_MBAFF){
6361 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6362 int qp = s->current_picture.qscale_table[mb_xy];
6363 if(qp <= qp_thresh
6364 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6365 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6366 return;
6370 // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6371 if(!h->pps.cabac && h->pps.transform_8x8_mode){
6372 int top_type, left_type[2];
6373 top_type = s->current_picture.mb_type[h->top_mb_xy] ;
6374 left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6375 left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6377 if(IS_8x8DCT(top_type)){
6378 h->non_zero_count_cache[4+8*0]=
6379 h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6380 h->non_zero_count_cache[6+8*0]=
6381 h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6383 if(IS_8x8DCT(left_type[0])){
6384 h->non_zero_count_cache[3+8*1]=
6385 h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6387 if(IS_8x8DCT(left_type[1])){
6388 h->non_zero_count_cache[3+8*3]=
6389 h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6392 if(IS_8x8DCT(mb_type)){
6393 h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
6394 h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
6396 h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6397 h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6399 h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6400 h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6402 h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6403 h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6407 if (FRAME_MBAFF
6408 // left mb is in picture
6409 && h->slice_table[mb_xy-1] != 0xFFFF
6410 // and current and left pair do not have the same interlaced type
6411 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6412 // and left mb is in the same slice if deblocking_filter == 2
6413 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6414 /* First vertical edge is different in MBAFF frames
6415 * There are 8 different bS to compute and 2 different Qp
6417 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6418 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6419 int16_t bS[8];
6420 int qp[2];
6421 int bqp[2];
6422 int rqp[2];
6423 int mb_qp, mbn0_qp, mbn1_qp;
6424 int i;
6425 first_vertical_edge_done = 1;
6427 if( IS_INTRA(mb_type) )
6428 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6429 else {
6430 for( i = 0; i < 8; i++ ) {
6431 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6433 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6434 bS[i] = 4;
6435 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6436 ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6437 (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6439 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6440 bS[i] = 2;
6441 else
6442 bS[i] = 1;
6446 mb_qp = s->current_picture.qscale_table[mb_xy];
6447 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6448 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6449 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6450 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6451 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6452 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6453 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6454 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6455 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6456 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6457 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6458 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6460 /* Filter edge */
6461 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6462 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6463 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6464 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6465 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6467 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6468 for( dir = 0; dir < 2; dir++ )
6470 int edge;
6471 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6472 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6473 int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6474 int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6475 int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6477 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6478 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6479 // how often to recheck mv-based bS when iterating between edges
6480 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6481 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6482 // how often to recheck mv-based bS when iterating along each edge
6483 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6485 if (first_vertical_edge_done) {
6486 start = 1;
6487 first_vertical_edge_done = 0;
6490 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6491 start = 1;
6493 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6494 && !IS_INTERLACED(mb_type)
6495 && IS_INTERLACED(mbm_type)
6497 // This is a special case in the norm where the filtering must
6498 // be done twice (one each of the field) even if we are in a
6499 // frame macroblock.
6501 static const int nnz_idx[4] = {4,5,6,3};
6502 unsigned int tmp_linesize = 2 * linesize;
6503 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6504 int mbn_xy = mb_xy - 2 * s->mb_stride;
6505 int qp;
6506 int i, j;
6507 int16_t bS[4];
6509 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6510 if( IS_INTRA(mb_type) ||
6511 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6512 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6513 } else {
6514 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6515 for( i = 0; i < 4; i++ ) {
6516 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6517 mbn_nnz[nnz_idx[i]] != 0 )
6518 bS[i] = 2;
6519 else
6520 bS[i] = 1;
6523 // Do not use s->qscale as luma quantizer because it has not the same
6524 // value in IPCM macroblocks.
6525 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6526 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6527 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6528 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6529 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6530 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6531 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6532 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6535 start = 1;
6538 /* Calculate bS */
6539 for( edge = start; edge < edges; edge++ ) {
6540 /* mbn_xy: neighbor macroblock */
6541 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6542 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6543 int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6544 int16_t bS[4];
6545 int qp;
6547 if( (edge&1) && IS_8x8DCT(mb_type) )
6548 continue;
6550 if( IS_INTRA(mb_type) ||
6551 IS_INTRA(mbn_type) ) {
6552 int value;
6553 if (edge == 0) {
6554 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6555 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6557 value = 4;
6558 } else {
6559 value = 3;
6561 } else {
6562 value = 3;
6564 bS[0] = bS[1] = bS[2] = bS[3] = value;
6565 } else {
6566 int i, l;
6567 int mv_done;
6569 if( edge & mask_edge ) {
6570 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6571 mv_done = 1;
6573 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6574 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6575 mv_done = 1;
6577 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6578 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6579 int bn_idx= b_idx - (dir ? 8:1);
6580 int v = 0;
6582 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6583 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6584 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6585 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6588 if(h->slice_type_nos == FF_B_TYPE && v){
6589 v=0;
6590 for( l = 0; !v && l < 2; l++ ) {
6591 int ln= 1-l;
6592 v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6593 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6594 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6598 bS[0] = bS[1] = bS[2] = bS[3] = v;
6599 mv_done = 1;
6601 else
6602 mv_done = 0;
6604 for( i = 0; i < 4; i++ ) {
6605 int x = dir == 0 ? edge : i;
6606 int y = dir == 0 ? i : edge;
6607 int b_idx= 8 + 4 + x + 8*y;
6608 int bn_idx= b_idx - (dir ? 8:1);
6610 if( h->non_zero_count_cache[b_idx] != 0 ||
6611 h->non_zero_count_cache[bn_idx] != 0 ) {
6612 bS[i] = 2;
6614 else if(!mv_done)
6616 bS[i] = 0;
6617 for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6618 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6619 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6620 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6621 bS[i] = 1;
6622 break;
6626 if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6627 bS[i] = 0;
6628 for( l = 0; l < 2; l++ ) {
6629 int ln= 1-l;
6630 if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6631 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6632 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6633 bS[i] = 1;
6634 break;
6641 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6642 continue;
6645 /* Filter edge */
6646 // Do not use s->qscale as luma quantizer because it has not the same
6647 // value in IPCM macroblocks.
6648 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6649 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6650 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6651 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6652 if( dir == 0 ) {
6653 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6654 if( (edge&1) == 0 ) {
6655 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6656 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6657 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6658 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6660 } else {
6661 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6662 if( (edge&1) == 0 ) {
6663 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6664 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6665 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6666 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
/**
 * Decode all macroblocks of the current slice, using either the CABAC or the
 * CAVLC entropy path depending on the active PPS (h->pps.cabac).
 *
 * Worker entry point: @p arg points to an H264Context* (one slice/thread
 * context), as dispatched by execute_decode_slices().
 *
 * @return 0 when the slice decoded to a clean end, -1 on error.
 *
 * NOTE(review): in this extract, lines consisting only of closing braces
 * appear to have been lost; the statement sequence below is otherwise
 * unchanged from the original.
 */
static int decode_slice(struct AVCodecContext *avctx, void *arg){
    H264Context *h = *(void**)arg;
    MpegEncContext * const s = &h->s;
    /* with data partitioning only AC end/error events are reported to the
       error-resilience code */
    const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;

    s->mb_skip_run= -1;

    if( h->pps.cabac ) {
        int i;

        /* realign */
        align_get_bits( &s->gb );

        /* init cabac */
        ff_init_cabac_states( &h->cabac);
        ff_init_cabac_decoder( &h->cabac,
                               s->gb.buffer + get_bits_count(&s->gb)/8,
                               ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
        /* calculate pre-state: derive each context's qp-dependent init value
           and pack it into the engine's (state, MPS) representation */
        for( i= 0; i < 460; i++ ) {
            int pre;
            if( h->slice_type_nos == FF_I_TYPE )
                pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
            else
                pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );

            if( pre <= 63 )
                h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
            else
                h->cabac_state[i] = 2 * ( pre - 64 ) + 1;

        /* CABAC macroblock loop */
        for(;;){
            //START_TIMER
            int ret = decode_mb_cabac(h);
            int eos;
            //STOP_TIMER("decode_mb_cabac")

            if(ret>=0) hl_decode_mb(h);

            /* MBAFF: the pair's bottom macroblock is decoded right away */
            if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
                s->mb_y++;

                if(ret>=0) ret = decode_mb_cabac(h);

                if(ret>=0) hl_decode_mb(h);
                s->mb_y--;

            eos = get_cabac_terminate( &h->cabac );

            /* bytestream may legally overrun by up to 2 bytes; more means corruption */
            if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
                return -1;

            /* advance to next macroblock position, wrapping rows */
            if( ++s->mb_x >= s->mb_width ) {
                s->mb_x = 0;
                ff_draw_horiz_band(s, 16*s->mb_y, 16);
                ++s->mb_y;
                if(FIELD_OR_MBAFF_PICTURE) {
                    ++s->mb_y;

            if( eos || s->mb_y >= s->mb_height ) {
                tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
                return 0;

    } else {
        /* CAVLC macroblock loop */
        for(;;){
            int ret = decode_mb_cavlc(h);

            if(ret>=0) hl_decode_mb(h);

            if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
                s->mb_y++;
                ret = decode_mb_cavlc(h);

                if(ret>=0) hl_decode_mb(h);
                s->mb_y--;

            if(ret<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);

                return -1;

            if(++s->mb_x >= s->mb_width){
                s->mb_x=0;
                ff_draw_horiz_band(s, 16*s->mb_y, 16);
                ++s->mb_y;
                if(FIELD_OR_MBAFF_PICTURE) {
                    ++s->mb_y;

                if(s->mb_y >= s->mb_height){
                    tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);

                    /* only an exact end-of-bitstream counts as success */
                    if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                        return 0;
                    }else{
                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                        return -1;

            /* end of bitstream reached while no skip run is pending */
            if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
                tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
                if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                    return 0;
                }else{
                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);

                    return -1;

#if 0 /* dead reference implementation of a simpler decode loop */
    for(;s->mb_y < s->mb_height; s->mb_y++){
        for(;s->mb_x < s->mb_width; s->mb_x++){
            int ret= decode_mb(h);

            hl_decode_mb(h);

            if(ret<0){
                av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);

                return -1;

            if(++s->mb_x >= s->mb_width){
                s->mb_x=0;
                if(++s->mb_y >= s->mb_height){
                    if(get_bits_count(s->gb) == s->gb.size_in_bits){
                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                        return 0;
                    }else{
                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                        return -1;

            /* NOTE(review): the '?' characters in the next line are extraction
               mojibake in this dead code; presumably it read
               get_bits_count(s->gb) >= s->gb.size_in_bits — confirm against
               the repository copy. */
            if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
                if(get_bits_count(s->gb) == s->gb.size_in_bits){
                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);

                    return 0;
                }else{
                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);

                    return -1;

        s->mb_x=0;
        ff_draw_horiz_band(s, 16*s->mb_y, 16);
#endif
    return -1; //not reached
6851 static int decode_picture_timing(H264Context *h){
6852 MpegEncContext * const s = &h->s;
6853 if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6854 skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6855 skip_bits(&s->gb, h->sps.dpb_output_delay_length); /* dpb_output_delay */
6857 if(h->sps.pic_struct_present_flag){
6858 unsigned int i, num_clock_ts;
6859 h->sei_pic_struct = get_bits(&s->gb, 4);
6861 if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6862 return -1;
6864 num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6866 for (i = 0 ; i < num_clock_ts ; i++){
6867 if(get_bits(&s->gb, 1)){ /* clock_timestamp_flag */
6868 unsigned int full_timestamp_flag;
6869 skip_bits(&s->gb, 2); /* ct_type */
6870 skip_bits(&s->gb, 1); /* nuit_field_based_flag */
6871 skip_bits(&s->gb, 5); /* counting_type */
6872 full_timestamp_flag = get_bits(&s->gb, 1);
6873 skip_bits(&s->gb, 1); /* discontinuity_flag */
6874 skip_bits(&s->gb, 1); /* cnt_dropped_flag */
6875 skip_bits(&s->gb, 8); /* n_frames */
6876 if(full_timestamp_flag){
6877 skip_bits(&s->gb, 6); /* seconds_value 0..59 */
6878 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6879 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6880 }else{
6881 if(get_bits(&s->gb, 1)){ /* seconds_flag */
6882 skip_bits(&s->gb, 6); /* seconds_value range 0..59 */
6883 if(get_bits(&s->gb, 1)){ /* minutes_flag */
6884 skip_bits(&s->gb, 6); /* minutes_value 0..59 */
6885 if(get_bits(&s->gb, 1)) /* hours_flag */
6886 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6890 if(h->sps.time_offset_length > 0)
6891 skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6895 return 0;
6898 static int decode_unregistered_user_data(H264Context *h, int size){
6899 MpegEncContext * const s = &h->s;
6900 uint8_t user_data[16+256];
6901 int e, build, i;
6903 if(size<16)
6904 return -1;
6906 for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6907 user_data[i]= get_bits(&s->gb, 8);
6910 user_data[i]= 0;
6911 e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6912 if(e==1 && build>=0)
6913 h->x264_build= build;
6915 if(s->avctx->debug & FF_DEBUG_BUGS)
6916 av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6918 for(; i<size; i++)
6919 skip_bits(&s->gb, 8);
6921 return 0;
6924 static int decode_sei(H264Context *h){
6925 MpegEncContext * const s = &h->s;
6927 while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6928 int size, type;
6930 type=0;
6932 type+= show_bits(&s->gb, 8);
6933 }while(get_bits(&s->gb, 8) == 255);
6935 size=0;
6937 size+= show_bits(&s->gb, 8);
6938 }while(get_bits(&s->gb, 8) == 255);
6940 switch(type){
6941 case 1: // Picture timing SEI
6942 if(decode_picture_timing(h) < 0)
6943 return -1;
6944 break;
6945 case 5:
6946 if(decode_unregistered_user_data(h, size) < 0)
6947 return -1;
6948 break;
6949 default:
6950 skip_bits(&s->gb, 8*size);
6953 //FIXME check bits here
6954 align_get_bits(&s->gb);
6957 return 0;
6960 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6961 MpegEncContext * const s = &h->s;
6962 int cpb_count, i;
6963 cpb_count = get_ue_golomb(&s->gb) + 1;
6964 get_bits(&s->gb, 4); /* bit_rate_scale */
6965 get_bits(&s->gb, 4); /* cpb_size_scale */
6966 for(i=0; i<cpb_count; i++){
6967 get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6968 get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6969 get_bits1(&s->gb); /* cbr_flag */
6971 get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6972 sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6973 sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6974 sps->time_offset_length = get_bits(&s->gb, 5);
6977 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6978 MpegEncContext * const s = &h->s;
6979 int aspect_ratio_info_present_flag;
6980 unsigned int aspect_ratio_idc;
6982 aspect_ratio_info_present_flag= get_bits1(&s->gb);
6984 if( aspect_ratio_info_present_flag ) {
6985 aspect_ratio_idc= get_bits(&s->gb, 8);
6986 if( aspect_ratio_idc == EXTENDED_SAR ) {
6987 sps->sar.num= get_bits(&s->gb, 16);
6988 sps->sar.den= get_bits(&s->gb, 16);
6989 }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6990 sps->sar= pixel_aspect[aspect_ratio_idc];
6991 }else{
6992 av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6993 return -1;
6995 }else{
6996 sps->sar.num=
6997 sps->sar.den= 0;
6999 // s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7001 if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
7002 get_bits1(&s->gb); /* overscan_appropriate_flag */
7005 if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
7006 get_bits(&s->gb, 3); /* video_format */
7007 get_bits1(&s->gb); /* video_full_range_flag */
7008 if(get_bits1(&s->gb)){ /* colour_description_present_flag */
7009 get_bits(&s->gb, 8); /* colour_primaries */
7010 get_bits(&s->gb, 8); /* transfer_characteristics */
7011 get_bits(&s->gb, 8); /* matrix_coefficients */
7015 if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
7016 get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
7017 get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
7020 sps->timing_info_present_flag = get_bits1(&s->gb);
7021 if(sps->timing_info_present_flag){
7022 sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7023 sps->time_scale = get_bits_long(&s->gb, 32);
7024 sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7027 sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7028 if(sps->nal_hrd_parameters_present_flag)
7029 decode_hrd_parameters(h, sps);
7030 sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7031 if(sps->vcl_hrd_parameters_present_flag)
7032 decode_hrd_parameters(h, sps);
7033 if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7034 get_bits1(&s->gb); /* low_delay_hrd_flag */
7035 sps->pic_struct_present_flag = get_bits1(&s->gb);
7037 sps->bitstream_restriction_flag = get_bits1(&s->gb);
7038 if(sps->bitstream_restriction_flag){
7039 unsigned int num_reorder_frames;
7040 get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
7041 get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7042 get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7043 get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7044 get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7045 num_reorder_frames= get_ue_golomb(&s->gb);
7046 get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7048 if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7049 av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7050 return -1;
7053 sps->num_reorder_frames= num_reorder_frames;
7056 return 0;
7059 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7060 const uint8_t *jvt_list, const uint8_t *fallback_list){
7061 MpegEncContext * const s = &h->s;
7062 int i, last = 8, next = 8;
7063 const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7064 if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7065 memcpy(factors, fallback_list, size*sizeof(uint8_t));
7066 else
7067 for(i=0;i<size;i++){
7068 if(next)
7069 next = (last + get_se_golomb(&s->gb)) & 0xff;
7070 if(!i && !next){ /* matrix not written, we use the preset one */
7071 memcpy(factors, jvt_list, size*sizeof(uint8_t));
7072 break;
7074 last = factors[scan[i]] = next ? next : last;
7078 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7079 uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7080 MpegEncContext * const s = &h->s;
7081 int fallback_sps = !is_sps && sps->scaling_matrix_present;
7082 const uint8_t *fallback[4] = {
7083 fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7084 fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7085 fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7086 fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7088 if(get_bits1(&s->gb)){
7089 sps->scaling_matrix_present |= is_sps;
7090 decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7091 decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7092 decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7093 decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7094 decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7095 decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7096 if(is_sps || pps->transform_8x8_mode){
7097 decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
7098 decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
7104 * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7106 static void *
7107 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7108 const size_t size, const char *name)
7110 if(id>=max) {
7111 av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7112 return NULL;
7115 if(!vec[id]) {
7116 vec[id] = av_mallocz(size);
7117 if(vec[id] == NULL)
7118 av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7120 return vec[id];
/**
 * Decode a sequence parameter set (SPS) NAL unit into h->sps_buffers[sps_id].
 *
 * @return 0 on success, -1 on any syntax/range error.
 *
 * NOTE(review): in this extract, closing-brace-only lines (and the closing
 * ");" of the debug av_log near the end) appear to have been lost; the
 * statement sequence below is otherwise unchanged.
 */
static inline int decode_seq_parameter_set(H264Context *h){
    MpegEncContext * const s = &h->s;
    int profile_idc, level_idc;
    unsigned int sps_id, tmp, mb_width, mb_height;
    int i;
    SPS *sps;

    profile_idc= get_bits(&s->gb, 8);
    get_bits1(&s->gb);   //constraint_set0_flag
    get_bits1(&s->gb);   //constraint_set1_flag
    get_bits1(&s->gb);   //constraint_set2_flag
    get_bits1(&s->gb);   //constraint_set3_flag
    get_bits(&s->gb, 4); // reserved
    level_idc= get_bits(&s->gb, 8);
    sps_id= get_ue_golomb(&s->gb);

    sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
    if(sps == NULL)
        return -1;

    sps->profile_idc= profile_idc;
    sps->level_idc= level_idc;

    /* flat default scaling matrices (all 16) until the stream overrides them */
    memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
    memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
    sps->scaling_matrix_present = 0;

    if(sps->profile_idc >= 100){ //high profile
        sps->chroma_format_idc= get_ue_golomb(&s->gb);
        if(sps->chroma_format_idc == 3)
            get_bits1(&s->gb);  //residual_color_transform_flag
        get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
        get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
        sps->transform_bypass = get_bits1(&s->gb);
        decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
    }else{
        sps->chroma_format_idc= 1;  /* baseline/main: 4:2:0 implied */

    sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
    sps->poc_type= get_ue_golomb(&s->gb);

    if(sps->poc_type == 0){ //FIXME #define
        sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
    } else if(sps->poc_type == 1){//FIXME #define
        sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
        sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
        sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
        tmp= get_ue_golomb(&s->gb);

        /* bound the cycle length by the fixed offset_for_ref_frame array */
        if(tmp >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
            av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
            return -1;
        sps->poc_cycle_length= tmp;

        for(i=0; i<sps->poc_cycle_length; i++)
            sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
    }else if(sps->poc_type != 2){
        av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
        return -1;

    tmp= get_ue_golomb(&s->gb);
    if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
        av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
        return -1;
    sps->ref_frame_count= tmp;
    sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
    mb_width= get_ue_golomb(&s->gb) + 1;
    mb_height= get_ue_golomb(&s->gb) + 1;
    if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
       avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
        av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
        return -1;
    sps->mb_width = mb_width;
    sps->mb_height= mb_height;

    sps->frame_mbs_only_flag= get_bits1(&s->gb);
    if(!sps->frame_mbs_only_flag)
        sps->mb_aff= get_bits1(&s->gb);
    else
        sps->mb_aff= 0;

    sps->direct_8x8_inference_flag= get_bits1(&s->gb);

#ifndef ALLOW_INTERLACE
    if(sps->mb_aff)
        av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
#endif
    sps->crop= get_bits1(&s->gb);
    if(sps->crop){
        sps->crop_left  = get_ue_golomb(&s->gb);
        sps->crop_right = get_ue_golomb(&s->gb);
        sps->crop_top   = get_ue_golomb(&s->gb);
        sps->crop_bottom= get_ue_golomb(&s->gb);
        if(sps->crop_left || sps->crop_top){
            av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
        if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
            av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
    }else{
        sps->crop_left  =
        sps->crop_right =
        sps->crop_top   =
        sps->crop_bottom= 0;

    sps->vui_parameters_present_flag= get_bits1(&s->gb);
    if( sps->vui_parameters_present_flag )
        decode_vui_parameters(h, sps);

    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
               sps_id, sps->profile_idc, sps->level_idc,
               sps->poc_type,
               sps->ref_frame_count,
               sps->mb_width, sps->mb_height,
               sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
               sps->direct_8x8_inference_flag ? "8B8" : "",
               sps->crop_left, sps->crop_right,
               sps->crop_top, sps->crop_bottom,
               sps->vui_parameters_present_flag ? "VUI" : "",
               ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]

    /* vdpau branch: set up hardware decoding state for this SPS
       NOTE(review): decode_postinit() is not visible in this chunk — confirm
       its contract against the rest of the file. */
    if (decode_postinit(h, sps) < 0)
        return -1;

    return 0;
7259 static void
7260 build_qp_table(PPS *pps, int t, int index)
7262 int i;
7263 for(i = 0; i < 52; i++)
7264 pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
/**
 * Decode a picture parameter set (PPS) NAL unit into h->pps_buffers[pps_id].
 *
 * @param bit_length size of the RBSP in bits, used to detect the optional
 *                   trailing high-profile fields (8x8 transform, scaling
 *                   matrices, second chroma qp offset).
 * @return 0 on success, -1 on any syntax/range error.
 *
 * NOTE(review): in this extract, closing-brace-only lines appear to have
 * been lost; the statement sequence below is otherwise unchanged.
 */
static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
    MpegEncContext * const s = &h->s;
    unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
    PPS *pps;

    pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
    if(pps == NULL)
        return -1;

    /* the referenced SPS must already have been decoded */
    tmp= get_ue_golomb(&s->gb);
    if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
        av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
        return -1;
    pps->sps_id= tmp;

    pps->cabac= get_bits1(&s->gb);
    pps->pic_order_present= get_bits1(&s->gb);
    pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
    if(pps->slice_group_count > 1 ){
        /* FMO: map type is parsed but the per-type payloads are not (dead
           spec excerpts below); decoding such streams is unsupported */
        pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
        av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
        switch(pps->mb_slice_group_map_type){
        case 0:
#if 0
|   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |      |
|    run_length[ i ]                                |1  |ue(v) |
#endif
            break;
        case 2:
#if 0
|   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |      |
|{                                                  |   |      |
|    top_left_mb[ i ]                               |1  |ue(v) |
|    bottom_right_mb[ i ]                           |1  |ue(v) |
|   }                                               |   |      |
#endif
            break;
        case 3:
        case 4:
        case 5:
#if 0
|   slice_group_change_direction_flag               |1  |u(1)  |
|   slice_group_change_rate_minus1                  |1  |ue(v) |
#endif
            break;
        case 6:
#if 0
|   slice_group_id_cnt_minus1                       |1  |ue(v) |
|   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |      |
|)                                                  |   |      |
|    slice_group_id[ i ]                            |1  |u(v)  |
#endif
            break;

    pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
    pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
    if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
        av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
        pps->ref_count[0]= pps->ref_count[1]= 1;
        return -1;

    pps->weighted_pred= get_bits1(&s->gb);
    pps->weighted_bipred_idc= get_bits(&s->gb, 2);
    pps->init_qp= get_se_golomb(&s->gb) + 26;
    pps->init_qs= get_se_golomb(&s->gb) + 26;
    pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
    pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
    pps->constrained_intra_pred= get_bits1(&s->gb);
    pps->redundant_pic_cnt_present = get_bits1(&s->gb);

    pps->transform_8x8_mode= 0;
    h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
    memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
    memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));

    /* optional trailing fields, present only if bits remain in the RBSP */
    if(get_bits_count(&s->gb) < bit_length){
        pps->transform_8x8_mode= get_bits1(&s->gb);
        decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
        pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
    } else {
        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];

    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
    build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
    /* NOTE(review): this writes the *active* PPS copy (h->pps), not the pps
       just parsed — looks suspicious; confirm this is intentional. */
    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
        h->pps.chroma_qp_diff= 1;

    if(s->avctx->debug&FF_DEBUG_PICT_INFO){
        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
               pps_id, pps->sps_id,
               pps->cabac ? "CABAC" : "CAVLC",
               pps->slice_group_count,
               pps->ref_count[0], pps->ref_count[1],
               pps->weighted_pred ? "weighted" : "",
               pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
               pps->deblocking_filter_parameters_present ? "LPAR" : "",
               pps->constrained_intra_pred ? "CONSTR" : "",
               pps->redundant_pic_cnt_present ? "REDU" : "",
               pps->transform_8x8_mode ? "8x8DCT" : ""

    return 0;
7377 * Call decode_slice() for each context.
7379 * @param h h264 master context
7380 * @param context_count number of contexts to execute
7382 static void execute_decode_slices(H264Context *h, int context_count){
7383 MpegEncContext * const s = &h->s;
7384 AVCodecContext * const avctx= s->avctx;
7385 H264Context *hx;
7386 int i;
7388 if(avctx->vdpau_acceleration) {
7389 return;
7390 } else
7391 if(context_count == 1) {
7392 decode_slice(avctx, &h);
7393 } else {
7394 for(i = 1; i < context_count; i++) {
7395 hx = h->thread_context[i];
7396 hx->s.error_recognition = avctx->error_recognition;
7397 hx->s.error_count = 0;
7400 avctx->execute(avctx, (void *)decode_slice,
7401 (void **)h->thread_context, NULL, context_count, sizeof(void*));
7403 /* pull back stuff from slices to master context */
7404 hx = h->thread_context[context_count - 1];
7405 s->mb_x = hx->s.mb_x;
7406 s->mb_y = hx->s.mb_y;
7407 s->dropable = hx->s.dropable;
7408 s->picture_structure = hx->s.picture_structure;
7409 for(i = 1; i < context_count; i++)
7410 h->s.error_count += h->thread_context[i]->s.error_count;
/**
 * Split the input buffer into NAL units (Annex B start codes or AVC length
 * prefixes), unescape each one and dispatch it to the appropriate decoder
 * (slice, SPS, PPS, SEI, ...). Slice contexts are batched and executed via
 * execute_decode_slices().
 *
 * @return number of bytes consumed, or -1 on error.
 *
 * NOTE(review): in this extract, closing-brace-only lines appear to have
 * been lost; the statement sequence below is otherwise unchanged.
 */
static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
    MpegEncContext * const s = &h->s;
    AVCodecContext * const avctx= s->avctx;
    int buf_index=0;
    H264Context *hx; ///< thread context
    int context_count = 0;

    h->max_contexts = avctx->thread_count;
#if 0
    int i;
    for(i=0; i<50; i++){
        av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
#endif
    if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
        h->current_slice = 0;
        if (!s->first_field)
            s->current_picture_ptr= NULL;

    for(;;){
        int consumed;
        int dst_length;
        int bit_length;
        const uint8_t *ptr;
        int i, nalsize = 0;
        int err;

        if(h->is_avc) {
            /* AVC/mp4 framing: big-endian length prefix of nal_length_size bytes */
            if(buf_index >= buf_size) break;
            nalsize = 0;
            for(i = 0; i < h->nal_length_size; i++)
                nalsize = (nalsize << 8) | buf[buf_index++];
            if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
                if(nalsize == 1){
                    buf_index++;
                    continue;
                }else{
                    av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
                    break;
        } else {
            // start code prefix search
            for(; buf_index + 3 < buf_size; buf_index++){
                // This should always succeed in the first iteration.
                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
                    break;

            if(buf_index+3 >= buf_size) break;

            buf_index+=3;

        hx = h->thread_context[context_count];

        ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
        if (ptr==NULL || dst_length < 0){
            return -1;
        /* strip trailing zero bytes
           NOTE(review): ptr[dst_length - 1] is read before dst_length > 0 is
           checked — out-of-bounds read when dst_length == 0; the conditions
           should probably be swapped. */
        while(ptr[dst_length - 1] == 0 && dst_length > 0)
            dst_length--;
        bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));

        if(s->avctx->debug&FF_DEBUG_STARTCODE){
            av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);

        if (h->is_avc && (nalsize != consumed)){
            av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
            consumed= nalsize;

        buf_index += consumed;

        /* drop non-reference NALs when hurrying / skipping */
        if( (s->hurry_up == 1 && h->nal_ref_idc == 0) //FIXME do not discard SEI id
           ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
            continue;

      again:
        err = 0;
        switch(hx->nal_unit_type){
        case NAL_IDR_SLICE:
            if (h->nal_unit_type != NAL_IDR_SLICE) {
                av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
                return -1;
            idr(h); //FIXME ensure we don't loose some frames if there is reordering
            /* fallthrough: an IDR slice is parsed like a regular slice */
        case NAL_SLICE:
            init_get_bits(&hx->s.gb, ptr, bit_length);
            hx->intra_gb_ptr=
            hx->inter_gb_ptr= &hx->s.gb;
            hx->s.data_partitioning = 0;

            if((err = decode_slice_header(hx, h)))
               break;

            s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
            /* queue the slice for decoding unless skip settings discard it */
            if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
               && avctx->skip_frame < AVDISCARD_ALL)
                context_count++;
            break;
        case NAL_DPA:
            /* data partition A: slice header + partitioned bitstreams follow */
            init_get_bits(&hx->s.gb, ptr, bit_length);
            hx->intra_gb_ptr=
            hx->inter_gb_ptr= NULL;
            hx->s.data_partitioning = 1;

            err = decode_slice_header(hx, h);
            break;
        case NAL_DPB:
            init_get_bits(&hx->intra_gb, ptr, bit_length);
            hx->intra_gb_ptr= &hx->intra_gb;
            break;
        case NAL_DPC:
            init_get_bits(&hx->inter_gb, ptr, bit_length);
            hx->inter_gb_ptr= &hx->inter_gb;

            /* partition C completes the slice; queue it if A was seen */
            if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
               && s->context_initialized
               && s->hurry_up < 5
               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
               && avctx->skip_frame < AVDISCARD_ALL)
                context_count++;
            break;
        case NAL_SEI:
            init_get_bits(&s->gb, ptr, bit_length);
            decode_sei(h);
            break;
        case NAL_SPS:
            init_get_bits(&s->gb, ptr, bit_length);
            decode_seq_parameter_set(h);

            if(s->flags& CODEC_FLAG_LOW_DELAY)
                s->low_delay=1;

            if(avctx->has_b_frames < 2)
                avctx->has_b_frames= !s->low_delay;
            break;
        case NAL_PPS:
            init_get_bits(&s->gb, ptr, bit_length);

            decode_picture_parameter_set(h, bit_length);

            break;
        case NAL_AUD:
        case NAL_END_SEQUENCE:
        case NAL_END_STREAM:
        case NAL_FILLER_DATA:
        case NAL_SPS_EXT:
        case NAL_AUXILIARY_SLICE:
            break;
        default:
            av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);

        if(context_count == h->max_contexts) {
            execute_decode_slices(h, context_count);
            context_count = 0;

        if (err < 0)
            av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
        else if(err == 1) {
            /* Slice could not be decoded in parallel mode, copy down
             * NAL unit stuff to context 0 and restart. Note that
             * rbsp_buffer is not transferred, but since we no longer
             * run in parallel mode this should not be an issue. */
            h->nal_unit_type = hx->nal_unit_type;
            h->nal_ref_idc   = hx->nal_ref_idc;
            hx = h;
            goto again;

    if(context_count)
        execute_decode_slices(h, context_count);
    return buf_index;
7601 * returns the number of bytes consumed for building the current frame
7603 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7604 if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7605 if(pos+10>buf_size) pos=buf_size; // oops ;)
7607 return pos;
/**
 * Decoder entry point: parses the NAL units in buf and, once a complete
 * frame (or both fields of a field pair) has been decoded, returns the
 * next picture in display order through *pict / *data_size.
 * Returns the number of consumed bytes, or -1 on error.
 *
 * NOTE(review): this chunk is a scrape with embedded original line
 * numbers and missing '}' lines; the code is left byte-identical here
 * and only comments were added. Nesting of some branches is inferred
 * from the upstream file — confirm against a clean checkout.
 */
7610 static int decode_frame(AVCodecContext *avctx,
7611 void *data, int *data_size,
7612 const uint8_t *buf, int buf_size)
7614 H264Context *h = avctx->priv_data;
7615 MpegEncContext *s = &h->s;
7616 AVFrame *pict = data;
7617 int buf_index;
7619 s->flags= avctx->flags;
7620 s->flags2= avctx->flags2;
7622 /* end of stream, output what is still in the buffers */
7623 if (buf_size == 0) {
7624 Picture *out;
7625 int i, out_idx;
7627 //FIXME factorize this with the output code below
/* Pick the delayed picture with the smallest poc; the scan stops at a
 * picture with poc==0 or a key frame (reordering boundary). */
7628 out = h->delayed_pic[0];
7629 out_idx = 0;
7630 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7631 if(h->delayed_pic[i]->poc < out->poc){
7632 out = h->delayed_pic[i];
7633 out_idx = i;
/* Remove the chosen picture from the delay buffer. */
7636 for(i=out_idx; h->delayed_pic[i]; i++)
7637 h->delayed_pic[i] = h->delayed_pic[i+1];
7639 if(out){
7640 *data_size = sizeof(AVFrame);
7641 *pict= *(AVFrame*)out;
7644 return 0;
/* First call for an AVC stream: parse the SPS/PPS sets carried in the
 * avcC extradata before touching the payload itself. */
7647 if(h->is_avc && !h->got_avcC) {
7648 int i, cnt, nalsize;
7649 unsigned char *p = avctx->extradata;
7650 if(avctx->extradata_size < 7) {
7651 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7652 return -1;
7654 if(*p != 1) {
7655 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7656 return -1;
7658 /* sps and pps in the avcC always have length coded with 2 bytes,
7659 so put a fake nal_length_size = 2 while parsing them */
7660 h->nal_length_size = 2;
7661 // Decode sps from avcC
7662 cnt = *(p+5) & 0x1f; // Number of sps
7663 p += 6;
7664 for (i = 0; i < cnt; i++) {
7665 nalsize = AV_RB16(p) + 2;
7666 if(decode_nal_units(h, p, nalsize) < 0) {
7667 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7668 return -1;
7670 p += nalsize;
7672 // Decode pps from avcC
7673 cnt = *(p++); // Number of pps
7674 for (i = 0; i < cnt; i++) {
7675 nalsize = AV_RB16(p) + 2;
7676 if(decode_nal_units(h, p, nalsize) != nalsize) {
7677 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7678 return -1;
7680 p += nalsize;
7682 // Now store right nal length size, that will be use to parse all other nals
7683 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7684 // Do not reparse avcC
7685 h->got_avcC = 1;
/* Annex-B style extradata: run it once through the regular NAL decoder. */
7688 if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7689 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7690 return -1;
7691 h->got_avcC = 1;
7694 buf_index=decode_nal_units(h, buf, buf_size);
7695 if(buf_index < 0)
7696 return -1;
/* Without CODEC_FLAG2_CHUNKS a packet must have produced a picture. */
7698 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7699 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7700 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7701 return -1;
/* The picture is complete (non-chunked mode, or the last MB row was
 * reached in chunked mode): finish the frame and select output. */
7704 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7705 Picture *out = s->current_picture_ptr;
7706 Picture *cur = s->current_picture_ptr;
7707 int i, pics, cross_idr, out_of_order, out_idx;
7709 s->mb_y= 0;
7711 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7712 s->current_picture_ptr->pict_type= s->pict_type;
7714 #ifdef HAVE_VDPAU
7715 if (avctx->vdpau_acceleration) {
7716 ff_VDPAU_h264_set_reference_frames(h);
7718 #endif
/* Reference-picture marking and POC history only advance for pictures
 * that are not droppable. */
7720 if(!s->dropable) {
7721 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7722 h->prev_poc_msb= h->poc_msb;
7723 h->prev_poc_lsb= h->poc_lsb;
7725 h->prev_frame_num_offset= h->frame_num_offset;
7726 h->prev_frame_num= h->frame_num;
7728 #ifdef HAVE_VDPAU
7729 if (avctx->vdpau_acceleration) {
7730 ff_VDPAU_h264_picture_complete(h, buf, buf_size);
7732 #endif
7735 * FIXME: Error handling code does not seem to support interlaced
7736 * when slices span multiple rows
7737 * The ff_er_add_slice calls don't work right for bottom
7738 * fields; they cause massive erroneous error concealing
7739 * Error marking covers both fields (top and bottom).
7740 * This causes a mismatched s->error_count
7741 * and a bad error table. Further, the error count goes to
7742 * INT_MAX when called for bottom field, because mb_y is
7743 * past end by one (callers fault) and resync_mb_y != 0
7744 * causes problems for the first MB line, too.
7746 #ifdef HAVE_VDPAU
7747 if (!avctx->vdpau_acceleration)
7748 #endif
7749 if (!FIELD_PICTURE)
7750 ff_er_frame_end(s);
7752 MPV_frame_end(s);
/* field_poc stays INT_MAX until the matching field arrives. */
7754 if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7755 /* Wait for second field. */
7756 *data_size = 0;
7758 } else {
7759 cur->repeat_pict = 0;
7761 /* Signal interlacing information externally. */
7762 /* Prioritize picture timing SEI information over used decoding process if it exists. */
7763 if(h->sps.pic_struct_present_flag){
7764 switch (h->sei_pic_struct)
7766 case SEI_PIC_STRUCT_FRAME:
7767 cur->interlaced_frame = 0;
7768 break;
7769 case SEI_PIC_STRUCT_TOP_FIELD:
7770 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7771 case SEI_PIC_STRUCT_TOP_BOTTOM:
7772 case SEI_PIC_STRUCT_BOTTOM_TOP:
7773 cur->interlaced_frame = 1;
7774 break;
7775 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7776 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7777 // Signal the possibility of telecined film externally (pic_struct 5,6)
7778 // From these hints, let the applications decide if they apply deinterlacing.
7779 cur->repeat_pict = 1;
7780 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7781 break;
7782 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7783 // Force progressive here, as doubling interlaced frame is a bad idea.
7784 cur->interlaced_frame = 0;
7785 cur->repeat_pict = 2;
7786 break;
7787 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7788 cur->interlaced_frame = 0;
7789 cur->repeat_pict = 4;
7790 break;
7792 }else{
7793 /* Derive interlacing flag from used decoding process. */
7794 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7797 if (cur->field_poc[0] != cur->field_poc[1]){
7798 /* Derive top_field_first from field pocs. */
7799 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7800 }else{
7801 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7802 /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7803 if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7804 || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7805 cur->top_field_first = 1;
7806 else
7807 cur->top_field_first = 0;
7808 }else{
7809 /* Most likely progressive */
7810 cur->top_field_first = 0;
7814 //FIXME do something with unavailable reference frames
7816 /* Sort B-frames into display order */
7818 if(h->sps.bitstream_restriction_flag
7819 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7820 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7821 s->low_delay = 0;
/* In strict mode, without bitstream restrictions, assume the worst
 * case reorder depth. */
7824 if( s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7825 && !h->sps.bitstream_restriction_flag){
7826 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7827 s->low_delay= 0;
/* Append the current picture to the delay buffer and pin it so the
 * DPB cannot recycle it while it awaits output. */
7830 pics = 0;
7831 while(h->delayed_pic[pics]) pics++;
7833 assert(pics <= MAX_DELAYED_PIC_COUNT);
7835 h->delayed_pic[pics++] = cur;
7836 if(cur->reference == 0)
7837 cur->reference = DELAYED_PIC_REF;
/* Pick the lowest-poc candidate, as in the flush path above. */
7839 out = h->delayed_pic[0];
7840 out_idx = 0;
7841 for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7842 if(h->delayed_pic[i]->poc < out->poc){
7843 out = h->delayed_pic[i];
7844 out_idx = i;
7846 cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7848 out_of_order = !cross_idr && out->poc < h->outputed_poc;
/* Grow has_b_frames when pictures keep arriving out of order and the
 * stream declared no reorder bound. */
7850 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7852 else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7853 || (s->low_delay &&
7854 ((!cross_idr && out->poc > h->outputed_poc + 2)
7855 || cur->pict_type == FF_B_TYPE)))
7857 s->low_delay = 0;
7858 s->avctx->has_b_frames++;
/* Emit the chosen picture once the delay buffer exceeds the reorder
 * depth (or out-of-order output forces it out). */
7861 if(out_of_order || pics > s->avctx->has_b_frames){
7862 out->reference &= ~DELAYED_PIC_REF;
7863 for(i=out_idx; h->delayed_pic[i]; i++)
7864 h->delayed_pic[i] = h->delayed_pic[i+1];
7866 if(!out_of_order && pics > s->avctx->has_b_frames){
7867 *data_size = sizeof(AVFrame);
7869 h->outputed_poc = out->poc;
7870 *pict= *(AVFrame*)out;
7871 }else{
7872 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7877 assert(pict->data[0] || !*data_size);
7878 ff_print_debug_info(s, pict);
7879 //printf("out %d\n", (int)pict->data[0]);
7880 #if 0 //?
7882 /* Return the Picture timestamp as the frame number */
7883 /* we subtract 1 because it is added on utils.c */
7884 avctx->frame_number = s->picture_number - 1;
7885 #endif
7886 return get_consumed_bytes(s, buf_index, buf_size);
7888 #if 0
/* Dead code (compiled out with #if 0): fills h->mb_avail[] with the
 * availability of the neighbouring macroblocks of the current MB,
 * where "available" means the neighbour exists and belongs to the same
 * slice (slice_table entry equals h->slice_num).
 * Indices 0..2 are the top-left/top/top-right row, 3 is the left
 * neighbour; 4 and 5 are hard-coded (see FIXMEs). */
7889 static inline void fill_mb_avail(H264Context *h){
7890 MpegEncContext * const s = &h->s;
7891 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7893 if(s->mb_y){
7894 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7895 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7896 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7897 }else{
// First MB row: no neighbours above.
7898 h->mb_avail[0]=
7899 h->mb_avail[1]=
7900 h->mb_avail[2]= 0;
7902 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7903 h->mb_avail[4]= 1; //FIXME move out
7904 h->mb_avail[5]= 0; //FIXME move out
7906 #endif
7908 #ifdef TEST
7909 #undef printf
7910 #undef random
7911 #define COUNT 8000
7912 #define SIZE (COUNT*40)
/**
 * Standalone self-test, built only when TEST is defined.
 * Round-trips COUNT unsigned and signed exp-Golomb codes through a
 * PutBitContext/GetBitContext pair, timing each call and reporting any
 * read/write mismatch. Further DCT/quantizer/NAL tests below are
 * disabled with #if 0.
 */
7913 int main(void){
7914 int i;
7915 uint8_t temp[SIZE];
7916 PutBitContext pb;
7917 GetBitContext gb;
7918 // int int_temp[10000];
7919 DSPContext dsp;
7920 AVCodecContext avctx;
7922 dsputil_init(&dsp, &avctx);
/* Write COUNT unsigned exp-Golomb codes... */
7924 init_put_bits(&pb, temp, SIZE);
7925 printf("testing unsigned exp golomb\n");
7926 for(i=0; i<COUNT; i++){
7927 START_TIMER
7928 set_ue_golomb(&pb, i);
7929 STOP_TIMER("set_ue_golomb");
7931 flush_put_bits(&pb);
/* ...then read them back and verify each value. */
7933 init_get_bits(&gb, temp, 8*SIZE);
7934 for(i=0; i<COUNT; i++){
7935 int j, s;
7937 s= show_bits(&gb, 24);
7939 START_TIMER
7940 j= get_ue_golomb(&gb);
7941 if(j != i){
7942 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7943 // return -1;
7945 STOP_TIMER("get_ue_golomb");
/* Same round-trip for signed exp-Golomb, centred around zero. */
7949 init_put_bits(&pb, temp, SIZE);
7950 printf("testing signed exp golomb\n");
7951 for(i=0; i<COUNT; i++){
7952 START_TIMER
7953 set_se_golomb(&pb, i - COUNT/2);
7954 STOP_TIMER("set_se_golomb");
7956 flush_put_bits(&pb);
7958 init_get_bits(&gb, temp, 8*SIZE);
7959 for(i=0; i<COUNT; i++){
7960 int j, s;
7962 s= show_bits(&gb, 24);
7964 START_TIMER
7965 j= get_se_golomb(&gb);
7966 if(j != i - COUNT/2){
7967 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7968 // return -1;
7970 STOP_TIMER("get_se_golomb");
/* Disabled: 4x4 (I)DCT accuracy, quantizer sweep and NAL
 * escape/unescape round-trip tests. */
7973 #if 0
7974 printf("testing 4x4 (I)DCT\n");
7976 DCTELEM block[16];
7977 uint8_t src[16], ref[16];
7978 uint64_t error= 0, max_error=0;
7980 for(i=0; i<COUNT; i++){
7981 int j;
7982 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7983 for(j=0; j<16; j++){
7984 ref[j]= random()%255;
7985 src[j]= random()%255;
7988 h264_diff_dct_c(block, src, ref, 4);
7990 //normalize
7991 for(j=0; j<16; j++){
7992 // printf("%d ", block[j]);
7993 block[j]= block[j]*4;
7994 if(j&1) block[j]= (block[j]*4 + 2)/5;
7995 if(j&4) block[j]= (block[j]*4 + 2)/5;
7997 // printf("\n");
7999 s->dsp.h264_idct_add(ref, block, 4);
8000 /* for(j=0; j<16; j++){
8001 printf("%d ", ref[j]);
8003 printf("\n");*/
8005 for(j=0; j<16; j++){
8006 int diff= FFABS(src[j] - ref[j]);
8008 error+= diff*diff;
8009 max_error= FFMAX(max_error, diff);
8012 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8013 printf("testing quantizer\n");
8014 for(qp=0; qp<52; qp++){
8015 for(i=0; i<16; i++)
8016 src1_block[i]= src2_block[i]= random()%255;
8019 printf("Testing NAL layer\n");
8021 uint8_t bitstream[COUNT];
8022 uint8_t nal[COUNT*2];
8023 H264Context h;
8024 memset(&h, 0, sizeof(H264Context));
8026 for(i=0; i<COUNT; i++){
8027 int zeros= i;
8028 int nal_length;
8029 int consumed;
8030 int out_length;
8031 uint8_t *out;
8032 int j;
8034 for(j=0; j<COUNT; j++){
8035 bitstream[j]= (random() % 255) + 1;
8038 for(j=0; j<zeros; j++){
8039 int pos= random() % COUNT;
8040 while(bitstream[pos] == 0){
8041 pos++;
8042 pos %= COUNT;
8044 bitstream[pos]=0;
8047 START_TIMER
8049 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8050 if(nal_length<0){
8051 printf("encoding failed\n");
8052 return -1;
8055 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8057 STOP_TIMER("NAL")
8059 if(out_length != COUNT){
8060 printf("incorrect length %d %d\n", out_length, COUNT);
8061 return -1;
8064 if(consumed != nal_length){
8065 printf("incorrect consumed length %d %d\n", nal_length, consumed);
8066 return -1;
8069 if(memcmp(bitstream, out, COUNT)){
8070 printf("mismatch\n");
8071 return -1;
8074 #endif
8076 printf("Testing RBSP\n");
8079 return 0;
8081 #endif /* TEST */
8084 static av_cold int decode_end(AVCodecContext *avctx)
8086 H264Context *h = avctx->priv_data;
8087 MpegEncContext *s = &h->s;
8088 int i;
8090 av_freep(&h->rbsp_buffer[0]);
8091 av_freep(&h->rbsp_buffer[1]);
8092 free_tables(h); //FIXME cleanup init stuff perhaps
8094 for(i = 0; i < MAX_SPS_COUNT; i++)
8095 av_freep(h->sps_buffers + i);
8097 for(i = 0; i < MAX_PPS_COUNT; i++)
8098 av_freep(h->pps_buffers + i);
8100 MPV_common_end(s);
8102 // memset(h, 0, sizeof(H264Context));
8104 return 0;
8108 AVCodec h264_decoder = {
8109 "h264",
8110 CODEC_TYPE_VIDEO,
8111 CODEC_ID_H264,
8112 sizeof(H264Context),
8113 decode_init,
8114 NULL,
8115 decode_end,
8116 decode_frame,
8117 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8118 .flush= flush_dpb,
8119 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8122 #ifdef HAVE_VDPAU
8123 static av_cold int h264_vdpau_decode_init(AVCodecContext *avctx){
8124 if( avctx->thread_count > 1)
8125 return -1;
8126 if( !(avctx->slice_flags & SLICE_FLAG_CODED_ORDER) )
8127 return -1;
8128 if( !(avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD) ){
8129 dprintf(avctx, "h264.c: VDPAU decoder does not set SLICE_FLAG_ALLOW_FIELD\n");
8131 decode_init(avctx);
8133 avctx->vdpau_acceleration = 1;
8135 return 0;
8138 AVCodec h264_vdpau_decoder = {
8139 "h264_vdpau",
8140 CODEC_TYPE_VIDEO,
8141 CODEC_ID_H264_VDPAU,
8142 sizeof(H264Context),
8143 h264_vdpau_decode_init,
8144 NULL,
8145 decode_end,
8146 decode_frame,
8147 CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8148 .flush= flush_dpb,
8149 .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8151 #endif
8153 #include "svq3.c"