libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "snow.h"
  24
  25 #include "rangecoder.h"
  26 #include "mathops.h"
  27
  28 #include "mpegvideo.h"
  29
  30 #undef NDEBUG
  31 #include <assert.h>
  32
  33 static const int8_t quant3[256]={
  34  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  48 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  49 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  50 };
  51 static const int8_t quant3b[256]={
  52  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  59  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  67 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  68 };
  69 static const int8_t quant3bA[256]={
  70  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  85  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  86 };
  87 static const int8_t quant5[256]={
  88  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  95  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 102 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 103 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 104 };
 105 static const int8_t quant7[256]={
 106  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 107  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 108  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 113  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 118 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 119 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 120 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 121 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 122 };
 123 static const int8_t quant9[256]={
 124  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 125  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 131  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 137 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 138 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 139 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 140 };
 141 static const int8_t quant11[256]={
 142  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 143  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 144  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 149  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 154 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 155 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 156 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 157 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 158 };
 159 static const int8_t quant13[256]={
 160  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 161  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 162  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 163  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 167  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 171 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 172 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 173 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 174 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 175 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 176 };
 177
 178 #if 0 //64*cubic
 179 static const uint8_t obmc32[1024]={
 180   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 181   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 182   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 16, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 183   0,  0,  4,  4,  8,  8, 12, 16, 16, 20, 24, 24, 28, 28, 32, 32, 32, 32, 28, 28, 24, 24, 20, 16, 16, 12,  8,  8,  4,  4,  0,  0,
 184   0,  0,  4,  8,  8, 12, 16, 24, 28, 32, 36, 40, 44, 48, 48, 48, 48, 48, 48, 44, 40, 36, 32, 28, 24, 16, 12,  8,  8,  4,  0,  0,
 185   0,  4,  4,  8, 12, 20, 24, 32, 40, 44, 52, 56, 60, 64, 68, 72, 72, 68, 64, 60, 56, 52, 44, 40, 32, 24, 20, 12,  8,  4,  4,  0,
 186   0,  4,  4, 12, 16, 24, 32, 40, 52, 60, 68, 76, 80, 88, 88, 92, 92, 88, 88, 80, 76, 68, 60, 52, 40, 32, 24, 16, 12,  4,  4,  0,
 187   0,  4,  8, 16, 24, 32, 40, 52, 64, 76, 84, 92,100,108,112,116,116,112,108,100, 92, 84, 76, 64, 52, 40, 32, 24, 16,  8,  4,  0,
 188   0,  4,  8, 16, 28, 40, 52, 64, 76, 88,100,112,124,132,136,140,140,136,132,124,112,100, 88, 76, 64, 52, 40, 28, 16,  8,  4,  0,
 189   0,  4, 12, 20, 32, 44, 60, 76, 88,104,120,132,144,152,160,164,164,160,152,144,132,120,104, 88, 76, 60, 44, 32, 20, 12,  4,  0,
 190   0,  4, 12, 24, 36, 48, 68, 84,100,120,136,152,164,176,180,184,184,180,176,164,152,136,120,100, 84, 68, 48, 36, 24, 12,  4,  0,
 191   0,  4, 12, 24, 40, 56, 76, 92,112,132,152,168,180,192,204,208,208,204,192,180,168,152,132,112, 92, 76, 56, 40, 24, 12,  4,  0,
 192   0,  4, 16, 28, 44, 60, 80,100,124,144,164,180,196,208,220,224,224,220,208,196,180,164,144,124,100, 80, 60, 44, 28, 16,  4,  0,
 193   0,  8, 16, 28, 48, 64, 88,108,132,152,176,192,208,224,232,240,240,232,224,208,192,176,152,132,108, 88, 64, 48, 28, 16,  8,  0,
 194   0,  4, 16, 32, 48, 68, 88,112,136,160,180,204,220,232,244,248,248,244,232,220,204,180,160,136,112, 88, 68, 48, 32, 16,  4,  0,
 195   1,  8, 16, 32, 48, 72, 92,116,140,164,184,208,224,240,248,255,255,248,240,224,208,184,164,140,116, 92, 72, 48, 32, 16,  8,  1,
 196   1,  8, 16, 32, 48, 72, 92,116,140,164,184,208,224,240,248,255,255,248,240,224,208,184,164,140,116, 92, 72, 48, 32, 16,  8,  1,
 197   0,  4, 16, 32, 48, 68, 88,112,136,160,180,204,220,232,244,248,248,244,232,220,204,180,160,136,112, 88, 68, 48, 32, 16,  4,  0,
 198   0,  8, 16, 28, 48, 64, 88,108,132,152,176,192,208,224,232,240,240,232,224,208,192,176,152,132,108, 88, 64, 48, 28, 16,  8,  0,
 199   0,  4, 16, 28, 44, 60, 80,100,124,144,164,180,196,208,220,224,224,220,208,196,180,164,144,124,100, 80, 60, 44, 28, 16,  4,  0,
 200   0,  4, 12, 24, 40, 56, 76, 92,112,132,152,168,180,192,204,208,208,204,192,180,168,152,132,112, 92, 76, 56, 40, 24, 12,  4,  0,
 201   0,  4, 12, 24, 36, 48, 68, 84,100,120,136,152,164,176,180,184,184,180,176,164,152,136,120,100, 84, 68, 48, 36, 24, 12,  4,  0,
 202   0,  4, 12, 20, 32, 44, 60, 76, 88,104,120,132,144,152,160,164,164,160,152,144,132,120,104, 88, 76, 60, 44, 32, 20, 12,  4,  0,
 203   0,  4,  8, 16, 28, 40, 52, 64, 76, 88,100,112,124,132,136,140,140,136,132,124,112,100, 88, 76, 64, 52, 40, 28, 16,  8,  4,  0,
 204   0,  4,  8, 16, 24, 32, 40, 52, 64, 76, 84, 92,100,108,112,116,116,112,108,100, 92, 84, 76, 64, 52, 40, 32, 24, 16,  8,  4,  0,
 205   0,  4,  4, 12, 16, 24, 32, 40, 52, 60, 68, 76, 80, 88, 88, 92, 92, 88, 88, 80, 76, 68, 60, 52, 40, 32, 24, 16, 12,  4,  4,  0,
 206   0,  4,  4,  8, 12, 20, 24, 32, 40, 44, 52, 56, 60, 64, 68, 72, 72, 68, 64, 60, 56, 52, 44, 40, 32, 24, 20, 12,  8,  4,  4,  0,
 207   0,  0,  4,  8,  8, 12, 16, 24, 28, 32, 36, 40, 44, 48, 48, 48, 48, 48, 48, 44, 40, 36, 32, 28, 24, 16, 12,  8,  8,  4,  0,  0,
 208   0,  0,  4,  4,  8,  8, 12, 16, 16, 20, 24, 24, 28, 28, 32, 32, 32, 32, 28, 28, 24, 24, 20, 16, 16, 12,  8,  8,  4,  4,  0,  0,
 209   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 16, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 210   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 211   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 212 //error:0.000022
 213 };
 214 static const uint8_t obmc16[256]={
 215   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 216   0,  4,  4,  8, 16, 20, 20, 24, 24, 20, 20, 16,  8,  4,  4,  0,
 217   0,  4, 16, 24, 36, 44, 52, 60, 60, 52, 44, 36, 24, 16,  4,  0,
 218   0,  8, 24, 44, 60, 80, 96,104,104, 96, 80, 60, 44, 24,  8,  0,
 219   0, 16, 36, 60, 92,116,136,152,152,136,116, 92, 60, 36, 16,  0,
 220   0, 20, 44, 80,116,152,180,196,196,180,152,116, 80, 44, 20,  0,
 221   4, 20, 52, 96,136,180,212,228,228,212,180,136, 96, 52, 20,  4,
 222   4, 24, 60,104,152,196,228,248,248,228,196,152,104, 60, 24,  4,
 223   4, 24, 60,104,152,196,228,248,248,228,196,152,104, 60, 24,  4,
 224   4, 20, 52, 96,136,180,212,228,228,212,180,136, 96, 52, 20,  4,
 225   0, 20, 44, 80,116,152,180,196,196,180,152,116, 80, 44, 20,  0,
 226   0, 16, 36, 60, 92,116,136,152,152,136,116, 92, 60, 36, 16,  0,
 227   0,  8, 24, 44, 60, 80, 96,104,104, 96, 80, 60, 44, 24,  8,  0,
 228   0,  4, 16, 24, 36, 44, 52, 60, 60, 52, 44, 36, 24, 16,  4,  0,
 229   0,  4,  4,  8, 16, 20, 20, 24, 24, 20, 20, 16,  8,  4,  4,  0,
 230   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 231 //error:0.000033
 232 };
 233 #elif 1 // 64*linear
 234 static const uint8_t obmc32[1024]={
 235   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 236   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 237   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 238   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 239   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 240   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 241   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 242   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 243   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 244   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 245   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 246   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 247   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 248   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 249   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 250   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 251   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 252   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 253   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 254   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 255   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 256   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 257   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 258   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 259   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 260   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 261   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 262   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 263   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 264   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 265   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 266   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 267  //error:0.000020
 268 };
 269 static const uint8_t obmc16[256]={
 270   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 271   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 272   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 273   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 274   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 275  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 276  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 277  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 278  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 279  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 280  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 281   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 282   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 283   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 284   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 285   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 286 //error:0.000015
 287 };
 288 #else //64*cos
 289 static const uint8_t obmc32[1024]={
 290   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 291   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  4,  4,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 292   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 12, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 293   0,  0,  4,  4,  4,  8,  8, 12, 16, 20, 20, 24, 28, 28, 28, 28, 28, 28, 28, 28, 24, 20, 20, 16, 12,  8,  8,  4,  4,  4,  0,  0,
 294   0,  0,  4,  4,  8, 12, 16, 20, 24, 28, 36, 40, 44, 44, 48, 48, 48, 48, 44, 44, 40, 36, 28, 24, 20, 16, 12,  8,  4,  4,  0,  0,
 295   0,  0,  4,  8, 12, 20, 24, 32, 36, 44, 48, 56, 60, 64, 68, 68, 68, 68, 64, 60, 56, 48, 44, 36, 32, 24, 20, 12,  8,  4,  0,  0,
 296   0,  4,  4,  8, 16, 24, 32, 40, 48, 60, 68, 76, 80, 84, 88, 92, 92, 88, 84, 80, 76, 68, 60, 48, 40, 32, 24, 16,  8,  4,  4,  0,
 297   0,  4,  8, 12, 20, 32, 40, 52, 64, 76, 84, 96,104,108,112,116,116,112,108,104, 96, 84, 76, 64, 52, 40, 32, 20, 12,  8,  4,  0,
 298   0,  4,  8, 16, 24, 36, 48, 64, 76, 92,104,116,124,132,136,140,140,136,132,124,116,104, 92, 76, 64, 48, 36, 24, 16,  8,  4,  0,
 299   0,  4, 12, 20, 28, 44, 60, 76, 92,104,120,136,148,156,160,164,164,160,156,148,136,120,104, 92, 76, 60, 44, 28, 20, 12,  4,  0,
 300   0,  4, 12, 20, 36, 48, 68, 84,104,120,140,152,168,176,184,188,188,184,176,168,152,140,120,104, 84, 68, 48, 36, 20, 12,  4,  0,
 301   0,  4, 12, 24, 36, 56, 76, 96,116,136,152,172,184,196,204,208,208,204,196,184,172,152,136,116, 96, 76, 56, 36, 24, 12,  4,  0,
 302   0,  4, 12, 24, 44, 60, 80,104,124,148,168,184,200,212,224,228,228,224,212,200,184,168,148,124,104, 80, 60, 44, 24, 12,  4,  0,
 303   0,  4, 12, 28, 44, 64, 84,108,132,156,176,196,212,228,236,240,240,236,228,212,196,176,156,132,108, 84, 64, 44, 28, 12,  4,  0,
 304   0,  4, 16, 28, 48, 68, 88,112,136,160,184,204,224,236,244,252,252,244,236,224,204,184,160,136,112, 88, 68, 48, 28, 16,  4,  0,
 305   1,  4, 16, 28, 48, 68, 92,116,140,164,188,208,228,240,252,255,255,252,240,228,208,188,164,140,116, 92, 68, 48, 28, 16,  4,  1,
 306   1,  4, 16, 28, 48, 68, 92,116,140,164,188,208,228,240,252,255,255,252,240,228,208,188,164,140,116, 92, 68, 48, 28, 16,  4,  1,
 307   0,  4, 16, 28, 48, 68, 88,112,136,160,184,204,224,236,244,252,252,244,236,224,204,184,160,136,112, 88, 68, 48, 28, 16,  4,  0,
 308   0,  4, 12, 28, 44, 64, 84,108,132,156,176,196,212,228,236,240,240,236,228,212,196,176,156,132,108, 84, 64, 44, 28, 12,  4,  0,
 309   0,  4, 12, 24, 44, 60, 80,104,124,148,168,184,200,212,224,228,228,224,212,200,184,168,148,124,104, 80, 60, 44, 24, 12,  4,  0,
 310   0,  4, 12, 24, 36, 56, 76, 96,116,136,152,172,184,196,204,208,208,204,196,184,172,152,136,116, 96, 76, 56, 36, 24, 12,  4,  0,
 311   0,  4, 12, 20, 36, 48, 68, 84,104,120,140,152,168,176,184,188,188,184,176,168,152,140,120,104, 84, 68, 48, 36, 20, 12,  4,  0,
 312   0,  4, 12, 20, 28, 44, 60, 76, 92,104,120,136,148,156,160,164,164,160,156,148,136,120,104, 92, 76, 60, 44, 28, 20, 12,  4,  0,
 313   0,  4,  8, 16, 24, 36, 48, 64, 76, 92,104,116,124,132,136,140,140,136,132,124,116,104, 92, 76, 64, 48, 36, 24, 16,  8,  4,  0,
 314   0,  4,  8, 12, 20, 32, 40, 52, 64, 76, 84, 96,104,108,112,116,116,112,108,104, 96, 84, 76, 64, 52, 40, 32, 20, 12,  8,  4,  0,
 315   0,  4,  4,  8, 16, 24, 32, 40, 48, 60, 68, 76, 80, 84, 88, 92, 92, 88, 84, 80, 76, 68, 60, 48, 40, 32, 24, 16,  8,  4,  4,  0,
 316   0,  0,  4,  8, 12, 20, 24, 32, 36, 44, 48, 56, 60, 64, 68, 68, 68, 68, 64, 60, 56, 48, 44, 36, 32, 24, 20, 12,  8,  4,  0,  0,
 317   0,  0,  4,  4,  8, 12, 16, 20, 24, 28, 36, 40, 44, 44, 48, 48, 48, 48, 44, 44, 40, 36, 28, 24, 20, 16, 12,  8,  4,  4,  0,  0,
 318   0,  0,  4,  4,  4,  8,  8, 12, 16, 20, 20, 24, 28, 28, 28, 28, 28, 28, 28, 28, 24, 20, 20, 16, 12,  8,  8,  4,  4,  4,  0,  0,
 319   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 12, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 320   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  4,  4,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 321   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 322 //error:0.000022
 323 };
 324 static const uint8_t obmc16[256]={
 325   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 326   0,  0,  4,  8, 12, 16, 20, 20, 20, 20, 16, 12,  8,  4,  0,  0,
 327   0,  4, 12, 24, 32, 44, 52, 56, 56, 52, 44, 32, 24, 12,  4,  0,
 328   0,  8, 24, 40, 60, 80, 96,104,104, 96, 80, 60, 40, 24,  8,  0,
 329   0, 12, 32, 64, 92,120,140,152,152,140,120, 92, 64, 32, 12,  0,
 330   4, 16, 44, 80,120,156,184,196,196,184,156,120, 80, 44, 16,  4,
 331   4, 20, 52, 96,140,184,216,232,232,216,184,140, 96, 52, 20,  4,
 332   0, 20, 56,104,152,196,232,252,252,232,196,152,104, 56, 20,  0,
 333   0, 20, 56,104,152,196,232,252,252,232,196,152,104, 56, 20,  0,
 334   4, 20, 52, 96,140,184,216,232,232,216,184,140, 96, 52, 20,  4,
 335   4, 16, 44, 80,120,156,184,196,196,184,156,120, 80, 44, 16,  4,
 336   0, 12, 32, 64, 92,120,140,152,152,140,120, 92, 64, 32, 12,  0,
 337   0,  8, 24, 40, 60, 80, 96,104,104, 96, 80, 60, 40, 24,  8,  0,
 338   0,  4, 12, 24, 32, 44, 52, 56, 56, 52, 44, 32, 24, 12,  4,  0,
 339   0,  0,  4,  8, 12, 16, 20, 20, 20, 20, 16, 12,  8,  4,  0,  0,
 340   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 341 //error:0.000022
 342 };
 343 #endif /* 0 */
 344
 345 //linear *64
 346 static const uint8_t obmc8[64]={
 347   4, 12, 20, 28, 28, 20, 12,  4,
 348  12, 36, 60, 84, 84, 60, 36, 12,
 349  20, 60,100,140,140,100, 60, 20,
 350  28, 84,140,196,196,140, 84, 28,
 351  28, 84,140,196,196,140, 84, 28,
 352  20, 60,100,140,140,100, 60, 20,
 353  12, 36, 60, 84, 84, 60, 36, 12,
 354   4, 12, 20, 28, 28, 20, 12,  4,
 355 //error:0.000000
 356 };
 357
 358 //linear *64
 359 static const uint8_t obmc4[16]={
 360  16, 48, 48, 16,
 361  48,144,144, 48,
 362  48,144,144, 48,
 363  16, 48, 48, 16,
 364 //error:0.000000
 365 };
 366
 367 static const uint8_t * const obmc_tab[4]={
 368     obmc32, obmc16, obmc8, obmc4
 369 };
 370
 371 static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
 372
 373 typedef struct BlockNode{
 374     int16_t mx;
 375     int16_t my;
 376     uint8_t ref;
 377     uint8_t color[3];
 378     uint8_t type;
 379 //#define TYPE_SPLIT    1
 380 #define BLOCK_INTRA   1
 381 #define BLOCK_OPT     2
 382 //#define TYPE_NOCOLOR  4
 383     uint8_t level; //FIXME merge into type?
 384 }BlockNode;
 385
 386 static const BlockNode null_block= { //FIXME add border maybe
 387     .color= {128,128,128},
 388     .mx= 0,
 389     .my= 0,
 390     .ref= 0,
 391     .type= 0,
 392     .level= 0,
 393 };
 394
 395 #define LOG2_MB_SIZE 4
 396 #define MB_SIZE (1<<LOG2_MB_SIZE)
 397 #define ENCODER_EXTRA_BITS 4
 398 #define HTAPS_MAX 8
 399
 400 typedef struct x_and_coeff{
 401     int16_t x;
 402     uint16_t coeff;
 403 } x_and_coeff;
 404
 405 typedef struct SubBand{
 406     int level;
 407     int stride;
 408     int width;
 409     int height;
 410     int qlog;        ///< log(qscale)/log[2^(1/6)]
 411     DWTELEM *buf;
 412     IDWTELEM *ibuf;
 413     int buf_x_offset;
 414     int buf_y_offset;
 415     int stride_line; ///< Stride measured in lines, not pixels.
 416     x_and_coeff * x_coeff;
 417     struct SubBand *parent;
 418     uint8_t state[/*7*2*/ 7 + 512][32];
 419 }SubBand;
 420
 421 typedef struct Plane{
 422     int width;
 423     int height;
 424     SubBand band[MAX_DECOMPOSITIONS][4];
 425
 426     int htaps;
 427     int8_t hcoeff[HTAPS_MAX/2];
 428     int diag_mc;
 429     int fast_mc;
 430
 431     int last_htaps;
 432     int8_t last_hcoeff[HTAPS_MAX/2];
 433     int last_diag_mc;
 434 }Plane;
 435
 436 typedef struct SnowContext{
 437 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 438
 439     AVCodecContext *avctx;
 440     RangeCoder c;
 441     DSPContext dsp;
 442     AVFrame new_picture;
 443     AVFrame input_picture;              ///< new_picture with the internal linesizes
 444     AVFrame current_picture;
 445     AVFrame last_picture[MAX_REF_FRAMES];
 446     uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4];
 447     AVFrame mconly_picture;
 448 //     uint8_t q_context[16];
 449     uint8_t header_state[32];
 450     uint8_t block_state[128 + 32*128];
 451     int keyframe;
 452     int always_reset;
 453     int version;
 454     int spatial_decomposition_type;
 455     int last_spatial_decomposition_type;
 456     int temporal_decomposition_type;
 457     int spatial_decomposition_count;
 458     int last_spatial_decomposition_count;
 459     int temporal_decomposition_count;
 460     int max_ref_frames;
 461     int ref_frames;
 462     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 463     uint32_t *ref_scores[MAX_REF_FRAMES];
 464     DWTELEM *spatial_dwt_buffer;
 465     IDWTELEM *spatial_idwt_buffer;
 466     int colorspace_type;
 467     int chroma_h_shift;
 468     int chroma_v_shift;
 469     int spatial_scalability;
 470     int qlog;
 471     int last_qlog;
 472     int lambda;
 473     int lambda2;
 474     int pass1_rc;
 475     int mv_scale;
 476     int last_mv_scale;
 477     int qbias;
 478     int last_qbias;
 479 #define QBIAS_SHIFT 3
 480     int b_width;
 481     int b_height;
 482     int block_max_depth;
 483     int last_block_max_depth;
 484     Plane plane[MAX_PLANES];
 485     BlockNode *block;
 486 #define ME_CACHE_SIZE 1024
 487     int me_cache[ME_CACHE_SIZE];
 488     int me_cache_generation;
 489     slice_buffer sb;
 490
 491     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 492
 493     uint8_t *scratchbuf;
 494 }SnowContext;
 495
 496 typedef struct {
 497     IDWTELEM *b0;
 498     IDWTELEM *b1;
 499     IDWTELEM *b2;
 500     IDWTELEM *b3;
 501     int y;
 502 } DWTCompose;
 503
 504 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 505 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 506
 507 static void iterative_me(SnowContext *s);
 508
 509 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, IDWTELEM * base_buffer)
 510 {
 511     int i;
 512
 513     buf->base_buffer = base_buffer;
 514     buf->line_count = line_count;
 515     buf->line_width = line_width;
 516     buf->data_count = max_allocated_lines;
 517     buf->line = av_mallocz (sizeof(IDWTELEM *) * line_count);
 518     buf->data_stack = av_malloc (sizeof(IDWTELEM *) * max_allocated_lines);
 519
 520     for(i = 0; i < max_allocated_lines; i++){
 521         buf->data_stack[i] = av_malloc (sizeof(IDWTELEM) * line_width);
 522     }
 523
 524     buf->data_stack_top = max_allocated_lines - 1;
 525 }
 526
 527 static IDWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 528 {
 529     IDWTELEM * buffer;
 530
 531     assert(buf->data_stack_top >= 0);
 532 //  assert(!buf->line[line]);
 533     if (buf->line[line])
 534         return buf->line[line];
 535
 536     buffer = buf->data_stack[buf->data_stack_top];
 537     buf->data_stack_top--;
 538     buf->line[line] = buffer;
 539
 540     return buffer;
 541 }
 542
 543 static void slice_buffer_release(slice_buffer * buf, int line)
 544 {
 545     IDWTELEM * buffer;
 546
 547     assert(line >= 0 && line < buf->line_count);
 548     assert(buf->line[line]);
 549
 550     buffer = buf->line[line];
 551     buf->data_stack_top++;
 552     buf->data_stack[buf->data_stack_top] = buffer;
 553     buf->line[line] = NULL;
 554 }
 555
 556 static void slice_buffer_flush(slice_buffer * buf)
 557 {
 558     int i;
 559     for(i = 0; i < buf->line_count; i++){
 560         if (buf->line[i])
 561             slice_buffer_release(buf, i);
 562     }
 563 }
 564
 565 static void slice_buffer_destroy(slice_buffer * buf)
 566 {
 567     int i;
 568     slice_buffer_flush(buf);
 569
 570     for(i = buf->data_count - 1; i >= 0; i--){
 571         av_freep(&buf->data_stack[i]);
 572     }
 573     av_freep(&buf->data_stack);
 574     av_freep(&buf->line);
 575 }
 576
 577 #ifdef __sgi
 578 // Avoid a name clash on SGI IRIX
 579 #undef qexp
 580 #endif
 581 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 582 static uint8_t qexp[QROOT];
 583
 584 static inline int mirror(int v, int m){
 585     while((unsigned)v > (unsigned)m){
 586         v=-v;
 587         if(v<0) v+= 2*m;
 588     }
 589     return v;
 590 }
 591
 592 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 593     int i;
 594
 595     if(v){
 596         const int a= FFABS(v);
 597         const int e= av_log2(a);
 598 #if 1
 599         const int el= FFMIN(e, 10);
 600         put_rac(c, state+0, 0);
 601
 602         for(i=0; i<el; i++){
 603             put_rac(c, state+1+i, 1);  //1..10
 604         }
 605         for(; i<e; i++){
 606             put_rac(c, state+1+9, 1);  //1..10
 607         }
 608         put_rac(c, state+1+FFMIN(i,9), 0);
 609
 610         for(i=e-1; i>=el; i--){
 611             put_rac(c, state+22+9, (a>>i)&1); //22..31
 612         }
 613         for(; i>=0; i--){
 614             put_rac(c, state+22+i, (a>>i)&1); //22..31
 615         }
 616
 617         if(is_signed)
 618             put_rac(c, state+11 + el, v < 0); //11..21
 619 #else
 620
 621         put_rac(c, state+0, 0);
 622         if(e<=9){
 623             for(i=0; i<e; i++){
 624                 put_rac(c, state+1+i, 1);  //1..10
 625             }
 626             put_rac(c, state+1+i, 0);
 627
 628             for(i=e-1; i>=0; i--){
 629                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 630             }
 631
 632             if(is_signed)
 633                 put_rac(c, state+11 + e, v < 0); //11..21
 634         }else{
 635             for(i=0; i<e; i++){
 636                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 637             }
 638             put_rac(c, state+1+9, 0);
 639
 640             for(i=e-1; i>=0; i--){
 641                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 642             }
 643
 644             if(is_signed)
 645                 put_rac(c, state+11 + 10, v < 0); //11..21
 646         }
 647 #endif /* 1 */
 648     }else{
 649         put_rac(c, state+0, 1);
 650     }
 651 }
 652
 653 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 654     if(get_rac(c, state+0))
 655         return 0;
 656     else{
 657         int i, e, a;
 658         e= 0;
 659         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 660             e++;
 661         }
 662
 663         a= 1;
 664         for(i=e-1; i>=0; i--){
 665             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 666         }
 667
 668         e= -(is_signed && get_rac(c, state+11 + FFMIN(e,10))); //11..21
 669         return (a^e)-e;
 670     }
 671 }
 672
 673 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 674     int i;
 675     int r= log2>=0 ? 1<<log2 : 1;
 676
 677     assert(v>=0);
 678     assert(log2>=-4);
 679
 680     while(v >= r){
 681         put_rac(c, state+4+log2, 1);
 682         v -= r;
 683         log2++;
 684         if(log2>0) r+=r;
 685     }
 686     put_rac(c, state+4+log2, 0);
 687
 688     for(i=log2-1; i>=0; i--){
 689         put_rac(c, state+31-i, (v>>i)&1);
 690     }
 691 }
 692
 693 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 694     int i;
 695     int r= log2>=0 ? 1<<log2 : 1;
 696     int v=0;
 697
 698     assert(log2>=-4);
 699
 700     while(get_rac(c, state+4+log2)){
 701         v+= r;
 702         log2++;
 703         if(log2>0) r+=r;
 704     }
 705
 706     for(i=log2-1; i>=0; i--){
 707         v+= get_rac(c, state+31-i)<<i;
 708     }
 709
 710     return v;
 711 }
 712
 713 static av_always_inline void
 714 lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
 715      int dst_step, int src_step, int ref_step,
 716      int width, int mul, int add, int shift,
 717      int highpass, int inverse){
 718     const int mirror_left= !highpass;
 719     const int mirror_right= (width&1) ^ highpass;
 720     const int w= (width>>1) - 1 + (highpass & width);
 721     int i;
 722
 723 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 724     if(mirror_left){
 725         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 726         dst += dst_step;
 727         src += src_step;
 728     }
 729
 730     for(i=0; i<w; i++){
 731         dst[i*dst_step] =
 732             LIFT(src[i*src_step],
 733                  ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift),
 734                  inverse);
 735     }
 736
 737     if(mirror_right){
 738         dst[w*dst_step] =
 739             LIFT(src[w*src_step],
 740                  ((mul*2*ref[w*ref_step]+add)>>shift),
 741                  inverse);
 742     }
 743 }
 744
 745 static av_always_inline void
 746 inv_lift(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref,
 747          int dst_step, int src_step, int ref_step,
 748          int width, int mul, int add, int shift,
 749          int highpass, int inverse){
 750     const int mirror_left= !highpass;
 751     const int mirror_right= (width&1) ^ highpass;
 752     const int w= (width>>1) - 1 + (highpass & width);
 753     int i;
 754
 755 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 756     if(mirror_left){
 757         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 758         dst += dst_step;
 759         src += src_step;
 760     }
 761
 762     for(i=0; i<w; i++){
 763         dst[i*dst_step] =
 764             LIFT(src[i*src_step],
 765                  ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift),
 766                  inverse);
 767     }
 768
 769     if(mirror_right){
 770         dst[w*dst_step] =
 771             LIFT(src[w*src_step],
 772                  ((mul*2*ref[w*ref_step]+add)>>shift),
 773                  inverse);
 774     }
 775 }
 776
 777 #ifndef liftS
 778 static av_always_inline void
 779 liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
 780       int dst_step, int src_step, int ref_step,
 781       int width, int mul, int add, int shift,
 782       int highpass, int inverse){
 783     const int mirror_left= !highpass;
 784     const int mirror_right= (width&1) ^ highpass;
 785     const int w= (width>>1) - 1 + (highpass & width);
 786     int i;
 787
 788     assert(shift == 4);
 789 #define LIFTS(src, ref, inv) \
 790         ((inv) ? \
 791             (src) + (((ref) + 4*(src))>>shift): \
 792             -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 793     if(mirror_left){
 794         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 795         dst += dst_step;
 796         src += src_step;
 797     }
 798
 799     for(i=0; i<w; i++){
 800         dst[i*dst_step] =
 801             LIFTS(src[i*src_step],
 802                   mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add,
 803                   inverse);
 804     }
 805
 806     if(mirror_right){
 807         dst[w*dst_step] =
 808             LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 809     }
 810 }
 811 static av_always_inline void
 812 inv_liftS(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref,
 813           int dst_step, int src_step, int ref_step,
 814           int width, int mul, int add, int shift,
 815           int highpass, int inverse){
 816     const int mirror_left= !highpass;
 817     const int mirror_right= (width&1) ^ highpass;
 818     const int w= (width>>1) - 1 + (highpass & width);
 819     int i;
 820
 821     assert(shift == 4);
 822 #define LIFTS(src, ref, inv) \
 823     ((inv) ? \
 824         (src) + (((ref) + 4*(src))>>shift): \
 825         -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 826     if(mirror_left){
 827         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 828         dst += dst_step;
 829         src += src_step;
 830     }
 831
 832     for(i=0; i<w; i++){
 833         dst[i*dst_step] =
 834             LIFTS(src[i*src_step],
 835                   mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add,
 836                   inverse);
 837     }
 838
 839     if(mirror_right){
 840         dst[w*dst_step] =
 841             LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 842     }
 843 }
 844 #endif /* ! liftS */
 845
 846 static void horizontal_decompose53i(DWTELEM *b, int width){
 847     DWTELEM temp[width];
 848     const int width2= width>>1;
 849     int x;
 850     const int w2= (width+1)>>1;
 851
 852     for(x=0; x<width2; x++){
 853         temp[x   ]= b[2*x    ];
 854         temp[x+w2]= b[2*x + 1];
 855     }
 856     if(width&1)
 857         temp[x   ]= b[2*x    ];
 858 #if 0
 859     {
 860     int A1,A2,A3,A4;
 861     A2= temp[1       ];
 862     A4= temp[0       ];
 863     A1= temp[0+width2];
 864     A1 -= (A2 + A4)>>1;
 865     A4 += (A1 + 1)>>1;
 866     b[0+width2] = A1;
 867     b[0       ] = A4;
 868     for(x=1; x+1<width2; x+=2){
 869         A3= temp[x+width2];
 870         A4= temp[x+1     ];
 871         A3 -= (A2 + A4)>>1;
 872         A2 += (A1 + A3 + 2)>>2;
 873         b[x+width2] = A3;
 874         b[x       ] = A2;
 875
 876         A1= temp[x+1+width2];
 877         A2= temp[x+2       ];
 878         A1 -= (A2 + A4)>>1;
 879         A4 += (A1 + A3 + 2)>>2;
 880         b[x+1+width2] = A1;
 881         b[x+1       ] = A4;
 882     }
 883     A3= temp[width-1];
 884     A3 -= A2;
 885     A2 += (A1 + A3 + 2)>>2;
 886     b[width -1] = A3;
 887     b[width2-1] = A2;
 888     }
 889 #else
 890     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
 891     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
 892 #endif /* 0 */
 893 }
 894
 895 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 896     int i;
 897
 898     for(i=0; i<width; i++){
 899         b1[i] -= (b0[i] + b2[i])>>1;
 900     }
 901 }
 902
 903 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 904     int i;
 905
 906     for(i=0; i<width; i++){
 907         b1[i] += (b0[i] + b2[i] + 2)>>2;
 908     }
 909 }
 910
 911 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
 912     int y;
 913     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
 914     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
 915
 916     for(y=-2; y<height; y+=2){
 917         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
 918         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
 919
 920         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
 921         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
 922
 923         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
 924         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
 925
 926         b0=b2;
 927         b1=b3;
 928     }
 929 }
 930
 931 static void horizontal_decompose97i(DWTELEM *b, int width){
 932     DWTELEM temp[width];
 933     const int w2= (width+1)>>1;
 934
 935     lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
 936     liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
 937     lift (b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
 938     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
 939 }
 940
 941
 942 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 943     int i;
 944
 945     for(i=0; i<width; i++){
 946         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
 947     }
 948 }
 949
 950 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 951     int i;
 952
 953     for(i=0; i<width; i++){
 954         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
 955     }
 956 }
 957
 958 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 959     int i;
 960
 961     for(i=0; i<width; i++){
 962 #ifdef liftS
 963         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
 964 #else
 965         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
 966 #endif
 967     }
 968 }
 969
 970 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 971     int i;
 972
 973     for(i=0; i<width; i++){
 974         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
 975     }
 976 }
 977
 978 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
 979     int y;
 980     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
 981     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
 982     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
 983     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
 984
 985     for(y=-4; y<height; y+=2){
 986         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
 987         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
 988
 989         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
 990         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
 991
 992         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
 993         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
 994         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
 995         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
 996
 997         b0=b2;
 998         b1=b3;
 999         b2=b4;
1000         b3=b5;
1001     }
1002 }
1003
1004 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1005     int level;
1006
1007     for(level=0; level<decomposition_count; level++){
1008         switch(type){
1009         case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1010         case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1011         }
1012     }
1013 }
1014
1015 static void horizontal_compose53i(IDWTELEM *b, int width){
1016     IDWTELEM temp[width];
1017     const int width2= width>>1;
1018     const int w2= (width+1)>>1;
1019     int x;
1020
1021 #if 0
1022     int A1,A2,A3,A4;
1023     A2= temp[1       ];
1024     A4= temp[0       ];
1025     A1= temp[0+width2];
1026     A1 -= (A2 + A4)>>1;
1027     A4 += (A1 + 1)>>1;
1028     b[0+width2] = A1;
1029     b[0       ] = A4;
1030     for(x=1; x+1<width2; x+=2){
1031         A3= temp[x+width2];
1032         A4= temp[x+1     ];
1033         A3 -= (A2 + A4)>>1;
1034         A2 += (A1 + A3 + 2)>>2;
1035         b[x+width2] = A3;
1036         b[x       ] = A2;
1037
1038         A1= temp[x+1+width2];
1039         A2= temp[x+2       ];
1040         A1 -= (A2 + A4)>>1;
1041         A4 += (A1 + A3 + 2)>>2;
1042         b[x+1+width2] = A1;
1043         b[x+1       ] = A4;
1044     }
1045     A3= temp[width-1];
1046     A3 -= A2;
1047     A2 += (A1 + A3 + 2)>>2;
1048     b[width -1] = A3;
1049     b[width2-1] = A2;
1050 #else
1051     inv_lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1052     inv_lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1053 #endif /* 0 */
1054     for(x=0; x<width2; x++){
1055         b[2*x    ]= temp[x   ];
1056         b[2*x + 1]= temp[x+w2];
1057     }
1058     if(width&1)
1059         b[2*x    ]= temp[x   ];
1060 }
1061
1062 static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1063     int i;
1064
1065     for(i=0; i<width; i++){
1066         b1[i] += (b0[i] + b2[i])>>1;
1067     }
1068 }
1069
1070 static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1071     int i;
1072
1073     for(i=0; i<width; i++){
1074         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1075     }
1076 }
1077
1078 static void spatial_compose53i_buffered_init(DWTCompose *cs, slice_buffer * sb, int height, int stride_line){
1079     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1080     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1081     cs->y = -1;
1082 }
1083
1084 static void spatial_compose53i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride){
1085     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1086     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1087     cs->y = -1;
1088 }
1089
1090 static void spatial_compose53i_dy_buffered(DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line){
1091     int y= cs->y;
1092
1093     IDWTELEM *b0= cs->b0;
1094     IDWTELEM *b1= cs->b1;
1095     IDWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1096     IDWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1097
1098         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1099         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1100
1101         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1102         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1103
1104     cs->b0 = b2;
1105     cs->b1 = b3;
1106     cs->y += 2;
1107 }
1108
1109 static void spatial_compose53i_dy(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride){
1110     int y= cs->y;
1111     IDWTELEM *b0= cs->b0;
1112     IDWTELEM *b1= cs->b1;
1113     IDWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1114     IDWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1115
1116         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1117         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1118
1119         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1120         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1121
1122     cs->b0 = b2;
1123     cs->b1 = b3;
1124     cs->y += 2;
1125 }
1126
1127 static void av_unused spatial_compose53i(IDWTELEM *buffer, int width, int height, int stride){
1128     DWTCompose cs;
1129     spatial_compose53i_init(&cs, buffer, height, stride);
1130     while(cs.y <= height)
1131         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1132 }
1133
1134
1135 void ff_snow_horizontal_compose97i(IDWTELEM *b, int width){
1136     IDWTELEM temp[width];
1137     const int w2= (width+1)>>1;
1138
1139     inv_lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1140     inv_lift (temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1141     inv_liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
1142     inv_lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
1143 }
1144
1145 static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1146     int i;
1147
1148     for(i=0; i<width; i++){
1149         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1150     }
1151 }
1152
1153 static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1154     int i;
1155
1156     for(i=0; i<width; i++){
1157         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1158     }
1159 }
1160
1161 static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1162     int i;
1163
1164     for(i=0; i<width; i++){
1165 #ifdef liftS
1166         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1167 #else
1168         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1169 #endif
1170     }
1171 }
1172
1173 static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1174     int i;
1175
1176     for(i=0; i<width; i++){
1177         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1178     }
1179 }
1180
1181 void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
1182     int i;
1183
1184     for(i=0; i<width; i++){
1185         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1186         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1187 #ifdef liftS
1188         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1189 #else
1190         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1191 #endif
1192         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1193     }
1194 }
1195
1196 static void spatial_compose97i_buffered_init(DWTCompose *cs, slice_buffer * sb, int height, int stride_line){
1197     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1198     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1199     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1200     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1201     cs->y = -3;
1202 }
1203
1204 static void spatial_compose97i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride){
1205     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1206     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1207     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1208     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1209     cs->y = -3;
1210 }
1211
1212 static void spatial_compose97i_dy_buffered(DSPContext *dsp, DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line){
1213     int y = cs->y;
1214
1215     IDWTELEM *b0= cs->b0;
1216     IDWTELEM *b1= cs->b1;
1217     IDWTELEM *b2= cs->b2;
1218     IDWTELEM *b3= cs->b3;
1219     IDWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1220     IDWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1221
1222     if(y>0 && y+4<height){
1223         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1224     }else{
1225         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1226         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1227         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1228         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1229     }
1230
1231     if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1232     if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1233
1234     cs->b0=b2;
1235     cs->b1=b3;
1236     cs->b2=b4;
1237     cs->b3=b5;
1238     cs->y += 2;
1239 }
1240
1241 static void spatial_compose97i_dy(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride){
1242     int y = cs->y;
1243     IDWTELEM *b0= cs->b0;
1244     IDWTELEM *b1= cs->b1;
1245     IDWTELEM *b2= cs->b2;
1246     IDWTELEM *b3= cs->b3;
1247     IDWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1248     IDWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1249
1250     if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1251     if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1252     if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1253     if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1254
1255     if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1256     if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1257
1258     cs->b0=b2;
1259     cs->b1=b3;
1260     cs->b2=b4;
1261     cs->b3=b5;
1262     cs->y += 2;
1263 }
1264
1265 static void av_unused spatial_compose97i(IDWTELEM *buffer, int width, int height, int stride){
1266     DWTCompose cs;
1267     spatial_compose97i_init(&cs, buffer, height, stride);
1268     while(cs.y <= height)
1269         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1270 }
1271
1272 static void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1273     int level;
1274     for(level=decomposition_count-1; level>=0; level--){
1275         switch(type){
1276         case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1277         case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1278         }
1279     }
1280 }
1281
1282 static void ff_spatial_idwt_init(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1283     int level;
1284     for(level=decomposition_count-1; level>=0; level--){
1285         switch(type){
1286         case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1287         case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1288         }
1289     }
1290 }
1291
1292 static void ff_spatial_idwt_slice(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1293     const int support = type==1 ? 3 : 5;
1294     int level;
1295     if(type==2) return;
1296
1297     for(level=decomposition_count-1; level>=0; level--){
1298         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1299             switch(type){
1300             case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1301                 break;
1302             case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1303                 break;
1304             }
1305         }
1306     }
1307 }
1308
1309 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, DWTCompose *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1310     const int support = type==1 ? 3 : 5;
1311     int level;
1312     if(type==2) return;
1313
1314     for(level=decomposition_count-1; level>=0; level--){
1315         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1316             switch(type){
1317             case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1318                 break;
1319             case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1320                 break;
1321             }
1322         }
1323     }
1324 }
1325
1326 static void ff_spatial_idwt(IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1327         DWTCompose cs[MAX_DECOMPOSITIONS];
1328         int y;
1329         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1330         for(y=0; y<height; y+=4)
1331             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1332 }
1333
1334 static int encode_subband_c0run(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1335     const int w= b->width;
1336     const int h= b->height;
1337     int x, y;
1338
1339     if(1){
1340         int run=0;
1341         int runs[w*h];
1342         int run_index=0;
1343         int max_index;
1344
1345         for(y=0; y<h; y++){
1346             for(x=0; x<w; x++){
1347                 int v, p=0;
1348                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1349                 v= src[x + y*stride];
1350
1351                 if(y){
1352                     t= src[x + (y-1)*stride];
1353                     if(x){
1354                         lt= src[x - 1 + (y-1)*stride];
1355                     }
1356                     if(x + 1 < w){
1357                         rt= src[x + 1 + (y-1)*stride];
1358                     }
1359                 }
1360                 if(x){
1361                     l= src[x - 1 + y*stride];
1362                     /*if(x > 1){
1363                         if(orientation==1) ll= src[y + (x-2)*stride];
1364                         else               ll= src[x - 2 + y*stride];
1365                     }*/
1366                 }
1367                 if(parent){
1368                     int px= x>>1;
1369                     int py= y>>1;
1370                     if(px<b->parent->width && py<b->parent->height)
1371                         p= parent[px + py*2*stride];
1372                 }
1373                 if(!(/*ll|*/l|lt|t|rt|p)){
1374                     if(v){
1375                         runs[run_index++]= run;
1376                         run=0;
1377                     }else{
1378                         run++;
1379                     }
1380                 }
1381             }
1382         }
1383         max_index= run_index;
1384         runs[run_index++]= run;
1385         run_index=0;
1386         run= runs[run_index++];
1387
1388         put_symbol2(&s->c, b->state[30], max_index, 0);
1389         if(run_index <= max_index)
1390             put_symbol2(&s->c, b->state[1], run, 3);
1391
1392         for(y=0; y<h; y++){
1393             if(s->c.bytestream_end - s->c.bytestream < w*40){
1394                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1395                 return -1;
1396             }
1397             for(x=0; x<w; x++){
1398                 int v, p=0;
1399                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1400                 v= src[x + y*stride];
1401
1402                 if(y){
1403                     t= src[x + (y-1)*stride];
1404                     if(x){
1405                         lt= src[x - 1 + (y-1)*stride];
1406                     }
1407                     if(x + 1 < w){
1408                         rt= src[x + 1 + (y-1)*stride];
1409                     }
1410                 }
1411                 if(x){
1412                     l= src[x - 1 + y*stride];
1413                     /*if(x > 1){
1414                         if(orientation==1) ll= src[y + (x-2)*stride];
1415                         else               ll= src[x - 2 + y*stride];
1416                     }*/
1417                 }
1418                 if(parent){
1419                     int px= x>>1;
1420                     int py= y>>1;
1421                     if(px<b->parent->width && py<b->parent->height)
1422                         p= parent[px + py*2*stride];
1423                 }
1424                 if(/*ll|*/l|lt|t|rt|p){
1425                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1426
1427                     put_rac(&s->c, &b->state[0][context], !!v);
1428                 }else{
1429                     if(!run){
1430                         run= runs[run_index++];
1431
1432                         if(run_index <= max_index)
1433                             put_symbol2(&s->c, b->state[1], run, 3);
1434                         assert(v);
1435                     }else{
1436                         run--;
1437                         assert(!v);
1438                     }
1439                 }
1440                 if(v){
1441                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1442                     int l2= 2*FFABS(l) + (l<0);
1443                     int t2= 2*FFABS(t) + (t<0);
1444
1445                     put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1446                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1447                 }
1448             }
1449         }
1450     }
1451     return 0;
1452 }
1453
1454 static int encode_subband(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1455 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1456 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1457     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1458 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1459 }
1460
1461 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1462     const int w= b->width;
1463     const int h= b->height;
1464     int x,y;
1465
1466     if(1){
1467         int run, runs;
1468         x_and_coeff *xc= b->x_coeff;
1469         x_and_coeff *prev_xc= NULL;
1470         x_and_coeff *prev2_xc= xc;
1471         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1472         x_and_coeff *prev_parent_xc= parent_xc;
1473
1474         runs= get_symbol2(&s->c, b->state[30], 0);
1475         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1476         else           run= INT_MAX;
1477
1478         for(y=0; y<h; y++){
1479             int v=0;
1480             int lt=0, t=0, rt=0;
1481
1482             if(y && prev_xc->x == 0){
1483                 rt= prev_xc->coeff;
1484             }
1485             for(x=0; x<w; x++){
1486                 int p=0;
1487                 const int l= v;
1488
1489                 lt= t; t= rt;
1490
1491                 if(y){
1492                     if(prev_xc->x <= x)
1493                         prev_xc++;
1494                     if(prev_xc->x == x + 1)
1495                         rt= prev_xc->coeff;
1496                     else
1497                         rt=0;
1498                 }
1499                 if(parent_xc){
1500                     if(x>>1 > parent_xc->x){
1501                         parent_xc++;
1502                     }
1503                     if(x>>1 == parent_xc->x){
1504                         p= parent_xc->coeff;
1505                     }
1506                 }
1507                 if(/*ll|*/l|lt|t|rt|p){
1508                     int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1509
1510                     v=get_rac(&s->c, &b->state[0][context]);
1511                     if(v){
1512                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1513                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1514
1515                         xc->x=x;
1516                         (xc++)->coeff= v;
1517                     }
1518                 }else{
1519                     if(!run){
1520                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1521                         else           run= INT_MAX;
1522                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1523                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1524
1525                         xc->x=x;
1526                         (xc++)->coeff= v;
1527                     }else{
1528                         int max_run;
1529                         run--;
1530                         v=0;
1531
1532                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1533                         else  max_run= FFMIN(run, w-x-1);
1534                         if(parent_xc)
1535                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1536                         x+= max_run;
1537                         run-= max_run;
1538                     }
1539                 }
1540             }
1541             (xc++)->x= w+1; //end marker
1542             prev_xc= prev2_xc;
1543             prev2_xc= xc;
1544
1545             if(parent_xc){
1546                 if(y&1){
1547                     while(parent_xc->x != parent->width+1)
1548                         parent_xc++;
1549                     parent_xc++;
1550                     prev_parent_xc= parent_xc;
1551                 }else{
1552                     parent_xc= prev_parent_xc;
1553                 }
1554             }
1555         }
1556
1557         (xc++)->x= w+1; //end marker
1558     }
1559 }
1560
1561 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1562     const int w= b->width;
1563     int y;
1564     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1565     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1566     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1567     int new_index = 0;
1568
1569     if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
1570         qadd= 0;
1571         qmul= 1<<QEXPSHIFT;
1572     }
1573
1574     /* If we are on the second or later slice, restore our index. */
1575     if (start_y != 0)
1576         new_index = save_state[0];
1577
1578
1579     for(y=start_y; y<h; y++){
1580         int x = 0;
1581         int v;
1582         IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1583         memset(line, 0, b->width*sizeof(IDWTELEM));
1584         v = b->x_coeff[new_index].coeff;
1585         x = b->x_coeff[new_index++].x;
1586         while(x < w){
1587             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1588             register int u= -(v&1);
1589             line[x] = (t^u) - u;
1590
1591             v = b->x_coeff[new_index].coeff;
1592             x = b->x_coeff[new_index++].x;
1593         }
1594     }
1595
1596     /* Save our variables for the next slice. */
1597     save_state[0] = new_index;
1598
1599     return;
1600 }
1601
1602 static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1603     int plane_index, level, orientation;
1604
1605     for(plane_index=0; plane_index<3; plane_index++){
1606         for(level=0; level<MAX_DECOMPOSITIONS; level++){
1607             for(orientation=level ? 1:0; orientation<4; orientation++){
1608                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1609             }
1610         }
1611     }
1612     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1613     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1614 }
1615
1616 static int alloc_blocks(SnowContext *s){
1617     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1618     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1619
1620     s->b_width = w;
1621     s->b_height= h;
1622
1623     av_free(s->block);
1624     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1625     return 0;
1626 }
1627
1628 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1629     uint8_t *bytestream= d->bytestream;
1630     uint8_t *bytestream_start= d->bytestream_start;
1631     *d= *s;
1632     d->bytestream= bytestream;
1633     d->bytestream_start= bytestream_start;
1634 }
1635
1636 //near copy & paste from dsputil, FIXME
1637 static int pix_sum(uint8_t * pix, int line_size, int w)
1638 {
1639     int s, i, j;
1640
1641     s = 0;
1642     for (i = 0; i < w; i++) {
1643         for (j = 0; j < w; j++) {
1644             s += pix[0];
1645             pix ++;
1646         }
1647         pix += line_size - w;
1648     }
1649     return s;
1650 }
1651
1652 //near copy & paste from dsputil, FIXME
1653 static int pix_norm1(uint8_t * pix, int line_size, int w)
1654 {
1655     int s, i, j;
1656     uint32_t *sq = ff_squareTbl + 256;
1657
1658     s = 0;
1659     for (i = 0; i < w; i++) {
1660         for (j = 0; j < w; j ++) {
1661             s += sq[pix[0]];
1662             pix ++;
1663         }
1664         pix += line_size - w;
1665     }
1666     return s;
1667 }
1668
1669 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1670     const int w= s->b_width << s->block_max_depth;
1671     const int rem_depth= s->block_max_depth - level;
1672     const int index= (x + y*w) << rem_depth;
1673     const int block_w= 1<<rem_depth;
1674     BlockNode block;
1675     int i,j;
1676
1677     block.color[0]= l;
1678     block.color[1]= cb;
1679     block.color[2]= cr;
1680     block.mx= mx;
1681     block.my= my;
1682     block.ref= ref;
1683     block.type= type;
1684     block.level= level;
1685
1686     for(j=0; j<block_w; j++){
1687         for(i=0; i<block_w; i++){
1688             s->block[index + i + j*w]= block;
1689         }
1690     }
1691 }
1692
1693 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1694     const int offset[3]= {
1695           y*c->  stride + x,
1696         ((y*c->uvstride + x)>>1),
1697         ((y*c->uvstride + x)>>1),
1698     };
1699     int i;
1700     for(i=0; i<3; i++){
1701         c->src[0][i]= src [i];
1702         c->ref[0][i]= ref [i] + offset[i];
1703     }
1704     assert(!ref_index);
1705 }
1706
1707 static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1708                            const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1709     if(s->ref_frames == 1){
1710         *mx = mid_pred(left->mx, top->mx, tr->mx);
1711         *my = mid_pred(left->my, top->my, tr->my);
1712     }else{
1713         const int *scale = scale_mv_ref[ref];
1714         *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1715                        (top ->mx * scale[top ->ref] + 128) >>8,
1716                        (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1717         *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1718                        (top ->my * scale[top ->ref] + 128) >>8,
1719                        (tr  ->my * scale[tr  ->ref] + 128) >>8);
1720     }
1721 }
1722
1723 //FIXME copy&paste
1724 #define P_LEFT P[1]
1725 #define P_TOP P[2]
1726 #define P_TOPRIGHT P[3]
1727 #define P_MEDIAN P[4]
1728 #define P_MV1 P[9]
1729 #define FLAG_QPEL   1 //must be 1
1730
1731 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1732     uint8_t p_buffer[1024];
1733     uint8_t i_buffer[1024];
1734     uint8_t p_state[sizeof(s->block_state)];
1735     uint8_t i_state[sizeof(s->block_state)];
1736     RangeCoder pc, ic;
1737     uint8_t *pbbak= s->c.bytestream;
1738     uint8_t *pbbak_start= s->c.bytestream_start;
1739     int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1740     const int w= s->b_width  << s->block_max_depth;
1741     const int h= s->b_height << s->block_max_depth;
1742     const int rem_depth= s->block_max_depth - level;
1743     const int index= (x + y*w) << rem_depth;
1744     const int block_w= 1<<(LOG2_MB_SIZE - level);
1745     int trx= (x+1)<<rem_depth;
1746     int try= (y+1)<<rem_depth;
1747     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
1748     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
1749     const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1750     const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1751     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1752     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1753     int pl = left->color[0];
1754     int pcb= left->color[1];
1755     int pcr= left->color[2];
1756     int pmx, pmy;
1757     int mx=0, my=0;
1758     int l,cr,cb;
1759     const int stride= s->current_picture.linesize[0];
1760     const int uvstride= s->current_picture.linesize[1];
1761     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1762                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1763                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1764     int P[10][2];
1765     int16_t last_mv[3][2];
1766     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1767     const int shift= 1+qpel;
1768     MotionEstContext *c= &s->m.me;
1769     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
1770     int mx_context= av_log2(2*FFABS(left->mx - top->mx));
1771     int my_context= av_log2(2*FFABS(left->my - top->my));
1772     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1773     int ref, best_ref, ref_score, ref_mx, ref_my;
1774
1775     assert(sizeof(s->block_state) >= 256);
1776     if(s->keyframe){
1777         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
1778         return 0;
1779     }
1780
1781 //    clip predictors / edge ?
1782
1783     P_LEFT[0]= left->mx;
1784     P_LEFT[1]= left->my;
1785     P_TOP [0]= top->mx;
1786     P_TOP [1]= top->my;
1787     P_TOPRIGHT[0]= tr->mx;
1788     P_TOPRIGHT[1]= tr->my;
1789
1790     last_mv[0][0]= s->block[index].mx;
1791     last_mv[0][1]= s->block[index].my;
1792     last_mv[1][0]= right->mx;
1793     last_mv[1][1]= right->my;
1794     last_mv[2][0]= bottom->mx;
1795     last_mv[2][1]= bottom->my;
1796
1797     s->m.mb_stride=2;
1798     s->m.mb_x=
1799     s->m.mb_y= 0;
1800     c->skip= 0;
1801
1802     assert(c->  stride ==   stride);
1803     assert(c->uvstride == uvstride);
1804
1805     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
1806     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
1807     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
1808     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
1809
1810     c->xmin = - x*block_w - 16+3;
1811     c->ymin = - y*block_w - 16+3;
1812     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
1813     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
1814
1815     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
1816     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
1817     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
1818     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
1819     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
1820     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
1821     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
1822
1823     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
1824     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
1825
1826     if (!y) {
1827         c->pred_x= P_LEFT[0];
1828         c->pred_y= P_LEFT[1];
1829     } else {
1830         c->pred_x = P_MEDIAN[0];
1831         c->pred_y = P_MEDIAN[1];
1832     }
1833
1834     score= INT_MAX;
1835     best_ref= 0;
1836     for(ref=0; ref<s->ref_frames; ref++){
1837         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
1838
1839         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
1840                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
1841
1842         assert(ref_mx >= c->xmin);
1843         assert(ref_mx <= c->xmax);
1844         assert(ref_my >= c->ymin);
1845         assert(ref_my <= c->ymax);
1846
1847         ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
1848         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
1849         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
1850         if(s->ref_mvs[ref]){
1851             s->ref_mvs[ref][index][0]= ref_mx;
1852             s->ref_mvs[ref][index][1]= ref_my;
1853             s->ref_scores[ref][index]= ref_score;
1854         }
1855         if(score > ref_score){
1856             score= ref_score;
1857             best_ref= ref;
1858             mx= ref_mx;
1859             my= ref_my;
1860         }
1861     }
1862     //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
1863
1864   //  subpel search
1865     base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
1866     pc= s->c;
1867     pc.bytestream_start=
1868     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
1869     memcpy(p_state, s->block_state, sizeof(s->block_state));
1870
1871     if(level!=s->block_max_depth)
1872         put_rac(&pc, &p_state[4 + s_context], 1);
1873     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
1874     if(s->ref_frames > 1)
1875         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
1876     pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
1877     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
1878     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
1879     p_len= pc.bytestream - pc.bytestream_start;
1880     score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
1881
1882     block_s= block_w*block_w;
1883     sum = pix_sum(current_data[0], stride, block_w);
1884     l= (sum + block_s/2)/block_s;
1885     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
1886
1887     block_s= block_w*block_w>>2;
1888     sum = pix_sum(current_data[1], uvstride, block_w>>1);
1889     cb= (sum + block_s/2)/block_s;
1890 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
1891     sum = pix_sum(current_data[2], uvstride, block_w>>1);
1892     cr= (sum + block_s/2)/block_s;
1893 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
1894
1895     ic= s->c;
1896     ic.bytestream_start=
1897     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
1898     memcpy(i_state, s->block_state, sizeof(s->block_state));
1899     if(level!=s->block_max_depth)
1900         put_rac(&ic, &i_state[4 + s_context], 1);
1901     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
1902     put_symbol(&ic, &i_state[32],  l-pl , 1);
1903     put_symbol(&ic, &i_state[64], cb-pcb, 1);
1904     put_symbol(&ic, &i_state[96], cr-pcr, 1);
1905     i_len= ic.bytestream - ic.bytestream_start;
1906     iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
1907
1908 //    assert(score==256*256*256*64-1);
1909     assert(iscore < 255*255*256 + s->lambda2*10);
1910     assert(iscore >= 0);
1911     assert(l>=0 && l<=255);
1912     assert(pl>=0 && pl<=255);
1913
1914     if(level==0){
1915         int varc= iscore >> 8;
1916         int vard= score >> 8;
1917         if (vard <= 64 || vard < varc)
1918             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
1919         else
1920             c->scene_change_score+= s->m.qscale;
1921     }
1922
1923     if(level!=s->block_max_depth){
1924         put_rac(&s->c, &s->block_state[4 + s_context], 0);
1925         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
1926         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
1927         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
1928         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
1929         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
1930
1931         if(score2 < score && score2 < iscore)
1932             return score2;
1933     }
1934
1935     if(iscore < score){
1936         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
1937         memcpy(pbbak, i_buffer, i_len);
1938         s->c= ic;
1939         s->c.bytestream_start= pbbak_start;
1940         s->c.bytestream= pbbak + i_len;
1941         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
1942         memcpy(s->block_state, i_state, sizeof(s->block_state));
1943         return iscore;
1944     }else{
1945         memcpy(pbbak, p_buffer, p_len);
1946         s->c= pc;
1947         s->c.bytestream_start= pbbak_start;
1948         s->c.bytestream= pbbak + p_len;
1949         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
1950         memcpy(s->block_state, p_state, sizeof(s->block_state));
1951         return score;
1952     }
1953 }
1954
1955 static av_always_inline int same_block(BlockNode *a, BlockNode *b){
1956     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
1957         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
1958     }else{
1959         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
1960     }
1961 }
1962
1963 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
1964     const int w= s->b_width  << s->block_max_depth;
1965     const int rem_depth= s->block_max_depth - level;
1966     const int index= (x + y*w) << rem_depth;
1967     int trx= (x+1)<<rem_depth;
1968     BlockNode *b= &s->block[index];
1969     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
1970     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
1971     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1972     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1973     int pl = left->color[0];
1974     int pcb= left->color[1];
1975     int pcr= left->color[2];
1976     int pmx, pmy;
1977     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
1978     int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
1979     int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
1980     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1981
1982     if(s->keyframe){
1983         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
1984         return;
1985     }
1986
1987     if(level!=s->block_max_depth){
1988         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
1989             put_rac(&s->c, &s->block_state[4 + s_context], 1);
1990         }else{
1991             put_rac(&s->c, &s->block_state[4 + s_context], 0);
1992             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
1993             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
1994             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
1995             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
1996             return;
1997         }
1998     }
1999     if(b->type & BLOCK_INTRA){
2000         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2001         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2002         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2003         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2004         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2005         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2006     }else{
2007         pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2008         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2009         if(s->ref_frames > 1)
2010             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2011         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2012         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2013         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2014     }
2015 }
2016
2017 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2018     const int w= s->b_width << s->block_max_depth;
2019     const int rem_depth= s->block_max_depth - level;
2020     const int index= (x + y*w) << rem_depth;
2021     int trx= (x+1)<<rem_depth;
2022     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2023     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2024     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2025     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2026     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2027
2028     if(s->keyframe){
2029         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2030         return;
2031     }
2032
2033     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2034         int type, mx, my;
2035         int l = left->color[0];
2036         int cb= left->color[1];
2037         int cr= left->color[2];
2038         int ref = 0;
2039         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2040         int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2041         int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2042
2043         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2044
2045         if(type){
2046             pred_mv(s, &mx, &my, 0, left, top, tr);
2047             l += get_symbol(&s->c, &s->block_state[32], 1);
2048             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2049             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2050         }else{
2051             if(s->ref_frames > 1)
2052                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2053             pred_mv(s, &mx, &my, ref, left, top, tr);
2054             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2055             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2056         }
2057         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2058     }else{
2059         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2060         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2061         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2062         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2063     }
2064 }
2065
2066 static void encode_blocks(SnowContext *s, int search){
2067     int x, y;
2068     int w= s->b_width;
2069     int h= s->b_height;
2070
2071     if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2072         iterative_me(s);
2073
2074     for(y=0; y<h; y++){
2075         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2076             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2077             return;
2078         }
2079         for(x=0; x<w; x++){
2080             if(s->avctx->me_method == ME_ITER || !search)
2081                 encode_q_branch2(s, 0, x, y);
2082             else
2083                 encode_q_branch (s, 0, x, y);
2084         }
2085     }
2086 }
2087
2088 static void decode_blocks(SnowContext *s){
2089     int x, y;
2090     int w= s->b_width;
2091     int h= s->b_height;
2092
2093     for(y=0; y<h; y++){
2094         for(x=0; x<w; x++){
2095             decode_q_branch(s, 0, x, y);
2096         }
2097     }
2098 }
2099
2100 static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2101     static const uint8_t weight[64]={
2102     8,7,6,5,4,3,2,1,
2103     7,7,0,0,0,0,0,1,
2104     6,0,6,0,0,0,2,0,
2105     5,0,0,5,0,3,0,0,
2106     4,0,0,0,4,0,0,0,
2107     3,0,0,5,0,3,0,0,
2108     2,0,6,0,0,0,2,0,
2109     1,7,0,0,0,0,0,1,
2110     };
2111
2112     static const uint8_t brane[256]={
2113     0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
2114     0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
2115     0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
2116     0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
2117     0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
2118     0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
2119     0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
2120     0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
2121     0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
2122     0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
2123     0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
2124     0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
2125     0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
2126     0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
2127     0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
2128     0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
2129     };
2130
2131     static const uint8_t needs[16]={
2132     0,1,0,0,
2133     2,4,2,0,
2134     0,1,0,0,
2135     15
2136     };
2137
2138     int x, y, b, r, l;
2139     int16_t tmpIt   [64*(32+HTAPS_MAX)];
2140     uint8_t tmp2t[3][stride*(32+HTAPS_MAX)];
2141     int16_t *tmpI= tmpIt;
2142     uint8_t *tmp2= tmp2t[0];
2143     const uint8_t *hpel[11];
2144     assert(dx<16 && dy<16);
2145     r= brane[dx + 16*dy]&15;
2146     l= brane[dx + 16*dy]>>4;
2147
2148     b= needs[l] | needs[r];
2149     if(p && !p->diag_mc)
2150         b= 15;
2151
2152     if(b&5){
2153         for(y=0; y < b_h+HTAPS_MAX-1; y++){
2154             for(x=0; x < b_w; x++){
2155                 int a_1=src[x + HTAPS_MAX/2-4];
2156                 int a0= src[x + HTAPS_MAX/2-3];
2157                 int a1= src[x + HTAPS_MAX/2-2];
2158                 int a2= src[x + HTAPS_MAX/2-1];
2159                 int a3= src[x + HTAPS_MAX/2+0];
2160                 int a4= src[x + HTAPS_MAX/2+1];
2161                 int a5= src[x + HTAPS_MAX/2+2];
2162                 int a6= src[x + HTAPS_MAX/2+3];
2163                 int am=0;
2164                 if(!p || p->fast_mc){
2165                     am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2166                     tmpI[x]= am;
2167                     am= (am+16)>>5;
2168                 }else{
2169                     am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
2170                     tmpI[x]= am;
2171                     am= (am+32)>>6;
2172                 }
2173
2174                 if(am&(~255)) am= ~(am>>31);
2175                 tmp2[x]= am;
2176             }
2177             tmpI+= 64;
2178             tmp2+= stride;
2179             src += stride;
2180         }
2181         src -= stride*y;
2182     }
2183     src += HTAPS_MAX/2 - 1;
2184     tmp2= tmp2t[1];
2185
2186     if(b&2){
2187         for(y=0; y < b_h; y++){
2188             for(x=0; x < b_w+1; x++){
2189                 int a_1=src[x + (HTAPS_MAX/2-4)*stride];
2190                 int a0= src[x + (HTAPS_MAX/2-3)*stride];
2191                 int a1= src[x + (HTAPS_MAX/2-2)*stride];
2192                 int a2= src[x + (HTAPS_MAX/2-1)*stride];
2193                 int a3= src[x + (HTAPS_MAX/2+0)*stride];
2194                 int a4= src[x + (HTAPS_MAX/2+1)*stride];
2195                 int a5= src[x + (HTAPS_MAX/2+2)*stride];
2196                 int a6= src[x + (HTAPS_MAX/2+3)*stride];
2197                 int am=0;
2198                 if(!p || p->fast_mc)
2199                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
2200                 else
2201                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
2202
2203                 if(am&(~255)) am= ~(am>>31);
2204                 tmp2[x]= am;
2205             }
2206             src += stride;
2207             tmp2+= stride;
2208         }
2209         src -= stride*y;
2210     }
2211     src += stride*(HTAPS_MAX/2 - 1);
2212     tmp2= tmp2t[2];
2213     tmpI= tmpIt;
2214     if(b&4){
2215         for(y=0; y < b_h; y++){
2216             for(x=0; x < b_w; x++){
2217                 int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
2218                 int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
2219                 int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
2220                 int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
2221                 int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
2222                 int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
2223                 int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
2224                 int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
2225                 int am=0;
2226                 if(!p || p->fast_mc)
2227                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
2228                 else
2229                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
2230                 if(am&(~255)) am= ~(am>>31);
2231                 tmp2[x]= am;
2232             }
2233             tmpI+= 64;
2234             tmp2+= stride;
2235         }
2236     }
2237
2238     hpel[ 0]= src;
2239     hpel[ 1]= tmp2t[0] + stride*(HTAPS_MAX/2-1);
2240     hpel[ 2]= src + 1;
2241
2242     hpel[ 4]= tmp2t[1];
2243     hpel[ 5]= tmp2t[2];
2244     hpel[ 6]= tmp2t[1] + 1;
2245
2246     hpel[ 8]= src + stride;
2247     hpel[ 9]= hpel[1] + stride;
2248     hpel[10]= hpel[8] + 1;
2249
2250     if(b==15){
2251         const uint8_t *src1= hpel[dx/8 + dy/8*4  ];
2252         const uint8_t *src2= hpel[dx/8 + dy/8*4+1];
2253         const uint8_t *src3= hpel[dx/8 + dy/8*4+4];
2254         const uint8_t *src4= hpel[dx/8 + dy/8*4+5];
2255         dx&=7;
2256         dy&=7;
2257         for(y=0; y < b_h; y++){
2258             for(x=0; x < b_w; x++){
2259                 dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
2260                          (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
2261             }
2262             src1+=stride;
2263             src2+=stride;
2264             src3+=stride;
2265             src4+=stride;
2266             dst +=stride;
2267         }
2268     }else{
2269         const uint8_t *src1= hpel[l];
2270         const uint8_t *src2= hpel[r];
2271         int a= weight[((dx&7) + (8*(dy&7)))];
2272         int b= 8-a;
2273         for(y=0; y < b_h; y++){
2274             for(x=0; x < b_w; x++){
2275                 dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
2276             }
2277             src1+=stride;
2278             src2+=stride;
2279             dst +=stride;
2280         }
2281     }
2282 }
2283
2284 #define mca(dx,dy,b_w)\
2285 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
2286     uint8_t tmp[stride*(b_w+HTAPS_MAX-1)];\
2287     assert(h==b_w);\
2288     mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, tmp, stride, b_w, b_w, dx, dy);\
2289 }
2290
2291 mca( 0, 0,16)
2292 mca( 8, 0,16)
2293 mca( 0, 8,16)
2294 mca( 8, 8,16)
2295 mca( 0, 0,8)
2296 mca( 8, 0,8)
2297 mca( 0, 8,8)
2298 mca( 8, 8,8)
2299
2300 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2301     if(block->type & BLOCK_INTRA){
2302         int x, y;
2303         const int color = block->color[plane_index];
2304         const int color4= color*0x01010101;
2305         if(b_w==32){
2306             for(y=0; y < b_h; y++){
2307                 *(uint32_t*)&dst[0 + y*stride]= color4;
2308                 *(uint32_t*)&dst[4 + y*stride]= color4;
2309                 *(uint32_t*)&dst[8 + y*stride]= color4;
2310                 *(uint32_t*)&dst[12+ y*stride]= color4;
2311                 *(uint32_t*)&dst[16+ y*stride]= color4;
2312                 *(uint32_t*)&dst[20+ y*stride]= color4;
2313                 *(uint32_t*)&dst[24+ y*stride]= color4;
2314                 *(uint32_t*)&dst[28+ y*stride]= color4;
2315             }
2316         }else if(b_w==16){
2317             for(y=0; y < b_h; y++){
2318                 *(uint32_t*)&dst[0 + y*stride]= color4;
2319                 *(uint32_t*)&dst[4 + y*stride]= color4;
2320                 *(uint32_t*)&dst[8 + y*stride]= color4;
2321                 *(uint32_t*)&dst[12+ y*stride]= color4;
2322             }
2323         }else if(b_w==8){
2324             for(y=0; y < b_h; y++){
2325                 *(uint32_t*)&dst[0 + y*stride]= color4;
2326                 *(uint32_t*)&dst[4 + y*stride]= color4;
2327             }
2328         }else if(b_w==4){
2329             for(y=0; y < b_h; y++){
2330                 *(uint32_t*)&dst[0 + y*stride]= color4;
2331             }
2332         }else{
2333             for(y=0; y < b_h; y++){
2334                 for(x=0; x < b_w; x++){
2335                     dst[x + y*stride]= color;
2336                 }
2337             }
2338         }
2339     }else{
2340         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2341         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2342         int mx= block->mx*scale;
2343         int my= block->my*scale;
2344         const int dx= mx&15;
2345         const int dy= my&15;
2346         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2347         sx += (mx>>4) - (HTAPS_MAX/2-1);
2348         sy += (my>>4) - (HTAPS_MAX/2-1);
2349         src += sx + sy*stride;
2350         if(   (unsigned)sx >= w - b_w - (HTAPS_MAX-2)
2351            || (unsigned)sy >= h - b_h - (HTAPS_MAX-2)){
2352             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1, sx, sy, w, h);
2353             src= tmp + MB_SIZE;
2354         }
2355 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2356 //        assert(!(b_w&(b_w-1)));
2357         assert(b_w>1 && b_h>1);
2358         assert((tab_index>=0 && tab_index<4) || b_w==32);
2359         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)) || !s->plane[plane_index].fast_mc )
2360             mc_block(&s->plane[plane_index], dst, src, tmp, stride, b_w, b_h, dx, dy);
2361         else if(b_w==32){
2362             int y;
2363             for(y=0; y<b_h; y+=16){
2364                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
2365                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
2366             }
2367         }else if(b_w==b_h)
2368             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
2369         else if(b_w==2*b_h){
2370             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
2371             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
2372         }else{
2373             assert(2*b_w==b_h);
2374             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
2375             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
2376         }
2377     }
2378 }
2379
2380 void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2381                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2382     int y, x;
2383     IDWTELEM * dst;
2384     for(y=0; y<b_h; y++){
2385         //FIXME ugly misuse of obmc_stride
2386         const uint8_t *obmc1= obmc + y*obmc_stride;
2387         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2388         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2389         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2390         dst = slice_buffer_get_line(sb, src_y + y);
2391         for(x=0; x<b_w; x++){
2392             int v=   obmc1[x] * block[3][x + y*src_stride]
2393                     +obmc2[x] * block[2][x + y*src_stride]
2394                     +obmc3[x] * block[1][x + y*src_stride]
2395                     +obmc4[x] * block[0][x + y*src_stride];
2396
2397             v <<= 8 - LOG2_OBMC_MAX;
2398             if(FRAC_BITS != 8){
2399                 v >>= 8 - FRAC_BITS;
2400             }
2401             if(add){
2402                 v += dst[x + src_x];
2403                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2404                 if(v&(~255)) v= ~(v>>31);
2405                 dst8[x + y*src_stride] = v;
2406             }else{
2407                 dst[x + src_x] -= v;
2408             }
2409         }
2410     }
2411 }
2412
2413 //FIXME name cleanup (b_w, block_w, b_width stuff)
2414 static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2415     const int b_width = s->b_width  << s->block_max_depth;
2416     const int b_height= s->b_height << s->block_max_depth;
2417     const int b_stride= b_width;
2418     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2419     BlockNode *rt= lt+1;
2420     BlockNode *lb= lt+b_stride;
2421     BlockNode *rb= lb+1;
2422     uint8_t *block[4];
2423     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2424     uint8_t *tmp = s->scratchbuf;
2425     uint8_t *ptmp;
2426     int x,y;
2427
2428     if(b_x<0){
2429         lt= rt;
2430         lb= rb;
2431     }else if(b_x + 1 >= b_width){
2432         rt= lt;
2433         rb= lb;
2434     }
2435     if(b_y<0){
2436         lt= lb;
2437         rt= rb;
2438     }else if(b_y + 1 >= b_height){
2439         lb= lt;
2440         rb= rt;
2441     }
2442
2443     if(src_x<0){ //FIXME merge with prev & always round internal width up to *16
2444         obmc -= src_x;
2445         b_w += src_x;
2446         if(!sliced && !offset_dst)
2447             dst -= src_x;
2448         src_x=0;
2449     }else if(src_x + b_w > w){
2450         b_w = w - src_x;
2451     }
2452     if(src_y<0){
2453         obmc -= src_y*obmc_stride;
2454         b_h += src_y;
2455         if(!sliced && !offset_dst)
2456             dst -= src_y*dst_stride;
2457         src_y=0;
2458     }else if(src_y + b_h> h){
2459         b_h = h - src_y;
2460     }
2461
2462     if(b_w<=0 || b_h<=0) return;
2463
2464     assert(src_stride > 2*MB_SIZE + 5);
2465
2466     if(!sliced && offset_dst)
2467         dst += src_x + src_y*dst_stride;
2468     dst8+= src_x + src_y*src_stride;
2469 //    src += src_x + src_y*src_stride;
2470
2471     ptmp= tmp + 3*tmp_step;
2472     block[0]= ptmp;
2473     ptmp+=tmp_step;
2474     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2475
2476     if(same_block(lt, rt)){
2477         block[1]= block[0];
2478     }else{
2479         block[1]= ptmp;
2480         ptmp+=tmp_step;
2481         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2482     }
2483
2484     if(same_block(lt, lb)){
2485         block[2]= block[0];
2486     }else if(same_block(rt, lb)){
2487         block[2]= block[1];
2488     }else{
2489         block[2]= ptmp;
2490         ptmp+=tmp_step;
2491         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2492     }
2493
2494     if(same_block(lt, rb) ){
2495         block[3]= block[0];
2496     }else if(same_block(rt, rb)){
2497         block[3]= block[1];
2498     }else if(same_block(lb, rb)){
2499         block[3]= block[2];
2500     }else{
2501         block[3]= ptmp;
2502         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2503     }
2504 #if 0
2505     for(y=0; y<b_h; y++){
2506         for(x=0; x<b_w; x++){
2507             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2508             if(add) dst[x + y*dst_stride] += v;
2509             else    dst[x + y*dst_stride] -= v;
2510         }
2511     }
2512     for(y=0; y<b_h; y++){
2513         uint8_t *obmc2= obmc + (obmc_stride>>1);
2514         for(x=0; x<b_w; x++){
2515             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2516             if(add) dst[x + y*dst_stride] += v;
2517             else    dst[x + y*dst_stride] -= v;
2518         }
2519     }
2520     for(y=0; y<b_h; y++){
2521         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2522         for(x=0; x<b_w; x++){
2523             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2524             if(add) dst[x + y*dst_stride] += v;
2525             else    dst[x + y*dst_stride] -= v;
2526         }
2527     }
2528     for(y=0; y<b_h; y++){
2529         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2530         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2531         for(x=0; x<b_w; x++){
2532             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2533             if(add) dst[x + y*dst_stride] += v;
2534             else    dst[x + y*dst_stride] -= v;
2535         }
2536     }
2537 #else
2538     if(sliced){
2539         s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2540     }else{
2541         for(y=0; y<b_h; y++){
2542             //FIXME ugly misuse of obmc_stride
2543             const uint8_t *obmc1= obmc + y*obmc_stride;
2544             const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2545             const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2546             const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2547             for(x=0; x<b_w; x++){
2548                 int v=   obmc1[x] * block[3][x + y*src_stride]
2549                         +obmc2[x] * block[2][x + y*src_stride]
2550                         +obmc3[x] * block[1][x + y*src_stride]
2551                         +obmc4[x] * block[0][x + y*src_stride];
2552
2553                 v <<= 8 - LOG2_OBMC_MAX;
2554                 if(FRAC_BITS != 8){
2555                     v >>= 8 - FRAC_BITS;
2556                 }
2557                 if(add){
2558                     v += dst[x + y*dst_stride];
2559                     v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2560                     if(v&(~255)) v= ~(v>>31);
2561                     dst8[x + y*src_stride] = v;
2562                 }else{
2563                     dst[x + y*dst_stride] -= v;
2564                 }
2565             }
2566         }
2567     }
2568 #endif /* 0 */
2569 }
2570
2571 static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
2572     Plane *p= &s->plane[plane_index];
2573     const int mb_w= s->b_width  << s->block_max_depth;
2574     const int mb_h= s->b_height << s->block_max_depth;
2575     int x, y, mb_x;
2576     int block_size = MB_SIZE >> s->block_max_depth;
2577     int block_w    = plane_index ? block_size/2 : block_size;
2578     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2579     int obmc_stride= plane_index ? block_size : 2*block_size;
2580     int ref_stride= s->current_picture.linesize[plane_index];
2581     uint8_t *dst8= s->current_picture.data[plane_index];
2582     int w= p->width;
2583     int h= p->height;
2584
2585     if(s->keyframe || (s->avctx->debug&512)){
2586         if(mb_y==mb_h)
2587             return;
2588
2589         if(add){
2590             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2591 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2592                 IDWTELEM * line = sb->line[y];
2593                 for(x=0; x<w; x++){
2594 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2595                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2596                     v >>= FRAC_BITS;
2597                     if(v&(~255)) v= ~(v>>31);
2598                     dst8[x + y*ref_stride]= v;
2599                 }
2600             }
2601         }else{
2602             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2603 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2604                 IDWTELEM * line = sb->line[y];
2605                 for(x=0; x<w; x++){
2606                     line[x] -= 128 << FRAC_BITS;
2607 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2608                 }
2609             }
2610         }
2611
2612         return;
2613     }
2614
2615     for(mb_x=0; mb_x<=mb_w; mb_x++){
2616         add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2617                    block_w*mb_x - block_w/2,
2618                    block_w*mb_y - block_w/2,
2619                    block_w, block_w,
2620                    w, h,
2621                    w, ref_stride, obmc_stride,
2622                    mb_x - 1, mb_y - 1,
2623                    add, 0, plane_index);
2624     }
2625 }
2626
2627 static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
2628     Plane *p= &s->plane[plane_index];
2629     const int mb_w= s->b_width  << s->block_max_depth;
2630     const int mb_h= s->b_height << s->block_max_depth;
2631     int x, y, mb_x;
2632     int block_size = MB_SIZE >> s->block_max_depth;
2633     int block_w    = plane_index ? block_size/2 : block_size;
2634     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2635     const int obmc_stride= plane_index ? block_size : 2*block_size;
2636     int ref_stride= s->current_picture.linesize[plane_index];
2637     uint8_t *dst8= s->current_picture.data[plane_index];
2638     int w= p->width;
2639     int h= p->height;
2640
2641     if(s->keyframe || (s->avctx->debug&512)){
2642         if(mb_y==mb_h)
2643             return;
2644
2645         if(add){
2646             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2647                 for(x=0; x<w; x++){
2648                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2649                     v >>= FRAC_BITS;
2650                     if(v&(~255)) v= ~(v>>31);
2651                     dst8[x + y*ref_stride]= v;
2652                 }
2653             }
2654         }else{
2655             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2656                 for(x=0; x<w; x++){
2657                     buf[x + y*w]-= 128<<FRAC_BITS;
2658                 }
2659             }
2660         }
2661
2662         return;
2663     }
2664
2665     for(mb_x=0; mb_x<=mb_w; mb_x++){
2666         add_yblock(s, 0, NULL, buf, dst8, obmc,
2667                    block_w*mb_x - block_w/2,
2668                    block_w*mb_y - block_w/2,
2669                    block_w, block_w,
2670                    w, h,
2671                    w, ref_stride, obmc_stride,
2672                    mb_x - 1, mb_y - 1,
2673                    add, 1, plane_index);
2674     }
2675 }
2676
2677 static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
2678     const int mb_h= s->b_height << s->block_max_depth;
2679     int mb_y;
2680     for(mb_y=0; mb_y<=mb_h; mb_y++)
2681         predict_slice(s, buf, plane_index, add, mb_y);
2682 }
2683
2684 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2685     int i, x2, y2;
2686     Plane *p= &s->plane[plane_index];
2687     const int block_size = MB_SIZE >> s->block_max_depth;
2688     const int block_w    = plane_index ? block_size/2 : block_size;
2689     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2690     const int obmc_stride= plane_index ? block_size : 2*block_size;
2691     const int ref_stride= s->current_picture.linesize[plane_index];
2692     uint8_t *src= s-> input_picture.data[plane_index];
2693     IDWTELEM *dst= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
2694     const int b_stride = s->b_width << s->block_max_depth;
2695     const int w= p->width;
2696     const int h= p->height;
2697     int index= mb_x + mb_y*b_stride;
2698     BlockNode *b= &s->block[index];
2699     BlockNode backup= *b;
2700     int ab=0;
2701     int aa=0;
2702
2703     b->type|= BLOCK_INTRA;
2704     b->color[plane_index]= 0;
2705     memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
2706
2707     for(i=0; i<4; i++){
2708         int mb_x2= mb_x + (i &1) - 1;
2709         int mb_y2= mb_y + (i>>1) - 1;
2710         int x= block_w*mb_x2 + block_w/2;
2711         int y= block_w*mb_y2 + block_w/2;
2712
2713         add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2714                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2715
2716         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2717             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2718                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2719                 int obmc_v= obmc[index];
2720                 int d;
2721                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2722                 if(x<0) obmc_v += obmc[index + block_w];
2723                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2724                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2725                 //FIXME precalculate this or simplify it somehow else
2726
2727                 d = -dst[index] + (1<<(FRAC_BITS-1));
2728                 dst[index] = d;
2729                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2730                 aa += obmc_v * obmc_v; //FIXME precalculate this
2731             }
2732         }
2733     }
2734     *b= backup;
2735
2736     return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2737 }
2738
2739 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2740     const int b_stride = s->b_width << s->block_max_depth;
2741     const int b_height = s->b_height<< s->block_max_depth;
2742     int index= x + y*b_stride;
2743     const BlockNode *b     = &s->block[index];
2744     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2745     const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2746     const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2747     const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2748     int dmx, dmy;
2749 //  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2750 //  int my_context= av_log2(2*FFABS(left->my - top->my));
2751
2752     if(x<0 || x>=b_stride || y>=b_height)
2753         return 0;
2754 /*
2755 1            0      0
2756 01X          1-2    1
2757 001XX        3-6    2-3
2758 0001XXX      7-14   4-7
2759 00001XXXX   15-30   8-15
2760 */
2761 //FIXME try accurate rate
2762 //FIXME intra and inter predictors if surrounding blocks are not the same type
2763     if(b->type & BLOCK_INTRA){
2764         return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2765                    + av_log2(2*FFABS(left->color[1] - b->color[1]))
2766                    + av_log2(2*FFABS(left->color[2] - b->color[2])));
2767     }else{
2768         pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2769         dmx-= b->mx;
2770         dmy-= b->my;
2771         return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2772                     + av_log2(2*FFABS(dmy))
2773                     + av_log2(2*b->ref));
2774     }
2775 }
2776
2777 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2778     Plane *p= &s->plane[plane_index];
2779     const int block_size = MB_SIZE >> s->block_max_depth;
2780     const int block_w    = plane_index ? block_size/2 : block_size;
2781     const int obmc_stride= plane_index ? block_size : 2*block_size;
2782     const int ref_stride= s->current_picture.linesize[plane_index];
2783     uint8_t *dst= s->current_picture.data[plane_index];
2784     uint8_t *src= s->  input_picture.data[plane_index];
2785     IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2786     uint8_t *cur = s->scratchbuf;
2787     uint8_t tmp[ref_stride*(2*MB_SIZE+HTAPS_MAX-1)];
2788     const int b_stride = s->b_width << s->block_max_depth;
2789     const int b_height = s->b_height<< s->block_max_depth;
2790     const int w= p->width;
2791     const int h= p->height;
2792     int distortion;
2793     int rate= 0;
2794     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2795     int sx= block_w*mb_x - block_w/2;
2796     int sy= block_w*mb_y - block_w/2;
2797     int x0= FFMAX(0,-sx);
2798     int y0= FFMAX(0,-sy);
2799     int x1= FFMIN(block_w*2, w-sx);
2800     int y1= FFMIN(block_w*2, h-sy);
2801     int i,x,y;
2802
2803     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2804
2805     for(y=y0; y<y1; y++){
2806         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2807         const IDWTELEM *pred1 = pred + y*obmc_stride;
2808         uint8_t *cur1 = cur + y*ref_stride;
2809         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2810         for(x=x0; x<x1; x++){
2811 #if FRAC_BITS >= LOG2_OBMC_MAX
2812             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2813 #else
2814             int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
2815 #endif
2816             v = (v + pred1[x]) >> FRAC_BITS;
2817             if(v&(~255)) v= ~(v>>31);
2818             dst1[x] = v;
2819         }
2820     }
2821
2822     /* copy the regions where obmc[] = (uint8_t)256 */
2823     if(LOG2_OBMC_MAX == 8
2824         && (mb_x == 0 || mb_x == b_stride-1)
2825         && (mb_y == 0 || mb_y == b_height-1)){
2826         if(mb_x == 0)
2827             x1 = block_w;
2828         else
2829             x0 = block_w;
2830         if(mb_y == 0)
2831             y1 = block_w;
2832         else
2833             y0 = block_w;
2834         for(y=y0; y<y1; y++)
2835             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2836     }
2837
2838     if(block_w==16){
2839         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2840         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2841         /* FIXME cmps overlap but do not cover the wavelet's whole support.
2842          * So improving the score of one block is not strictly guaranteed
2843          * to improve the score of the whole frame, thus iterative motion
2844          * estimation does not always converge. */
2845         if(s->avctx->me_cmp == FF_CMP_W97)
2846             distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2847         else if(s->avctx->me_cmp == FF_CMP_W53)
2848             distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2849         else{
2850             distortion = 0;
2851             for(i=0; i<4; i++){
2852                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
2853                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
2854             }
2855         }
2856     }else{
2857         assert(block_w==8);
2858         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
2859     }
2860
2861     if(plane_index==0){
2862         for(i=0; i<4; i++){
2863 /* ..RRr
2864  * .RXx.
2865  * rxx..
2866  */
2867             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
2868         }
2869         if(mb_x == b_stride-2)
2870             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
2871     }
2872     return distortion + rate*penalty_factor;
2873 }
2874
2875 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
2876     int i, y2;
2877     Plane *p= &s->plane[plane_index];
2878     const int block_size = MB_SIZE >> s->block_max_depth;
2879     const int block_w    = plane_index ? block_size/2 : block_size;
2880     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2881     const int obmc_stride= plane_index ? block_size : 2*block_size;
2882     const int ref_stride= s->current_picture.linesize[plane_index];
2883     uint8_t *dst= s->current_picture.data[plane_index];
2884     uint8_t *src= s-> input_picture.data[plane_index];
2885     //FIXME zero_dst is const but add_yblock changes dst if add is 0 (this is never the case for dst=zero_dst
2886     // const has only been removed from zero_dst to suppress a warning
2887     static IDWTELEM zero_dst[4096]; //FIXME
2888     const int b_stride = s->b_width << s->block_max_depth;
2889     const int w= p->width;
2890     const int h= p->height;
2891     int distortion= 0;
2892     int rate= 0;
2893     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2894
2895     for(i=0; i<9; i++){
2896         int mb_x2= mb_x + (i%3) - 1;
2897         int mb_y2= mb_y + (i/3) - 1;
2898         int x= block_w*mb_x2 + block_w/2;
2899         int y= block_w*mb_y2 + block_w/2;
2900
2901         add_yblock(s, 0, NULL, zero_dst, dst, obmc,
2902                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
2903
2904         //FIXME find a cleaner/simpler way to skip the outside stuff
2905         for(y2= y; y2<0; y2++)
2906             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2907         for(y2= h; y2<y+block_w; y2++)
2908             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2909         if(x<0){
2910             for(y2= y; y2<y+block_w; y2++)
2911                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
2912         }
2913         if(x+block_w > w){
2914             for(y2= y; y2<y+block_w; y2++)
2915                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
2916         }
2917
2918         assert(block_w== 8 || block_w==16);
2919         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
2920     }
2921
2922     if(plane_index==0){
2923         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
2924         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
2925
2926 /* ..RRRr
2927  * .RXXx.
2928  * .RXXx.
2929  * rxxx.
2930  */
2931         if(merged)
2932             rate = get_block_bits(s, mb_x, mb_y, 2);
2933         for(i=merged?4:0; i<9; i++){
2934             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
2935             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
2936         }
2937     }
2938     return distortion + rate*penalty_factor;
2939 }
2940
2941 static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
2942     const int b_stride= s->b_width << s->block_max_depth;
2943     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2944     BlockNode backup= *block;
2945     int rd, index, value;
2946
2947     assert(mb_x>=0 && mb_y>=0);
2948     assert(mb_x<b_stride);
2949
2950     if(intra){
2951         block->color[0] = p[0];
2952         block->color[1] = p[1];
2953         block->color[2] = p[2];
2954         block->type |= BLOCK_INTRA;
2955     }else{
2956         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
2957         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
2958         if(s->me_cache[index] == value)
2959             return 0;
2960         s->me_cache[index]= value;
2961
2962         block->mx= p[0];
2963         block->my= p[1];
2964         block->type &= ~BLOCK_INTRA;
2965     }
2966
2967     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
2968
2969 //FIXME chroma
2970     if(rd < *best_rd){
2971         *best_rd= rd;
2972         return 1;
2973     }else{
2974         *block= backup;
2975         return 0;
2976     }
2977 }
2978
2979 /* special case for int[2] args we discard afterwards,
2980  * fixes compilation problem with gcc 2.95 */
2981 static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
2982     int p[2] = {p0, p1};
2983     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
2984 }
2985
2986 static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
2987     const int b_stride= s->b_width << s->block_max_depth;
2988     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2989     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
2990     int rd, index, value;
2991
2992     assert(mb_x>=0 && mb_y>=0);
2993     assert(mb_x<b_stride);
2994     assert(((mb_x|mb_y)&1) == 0);
2995
2996     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
2997     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
2998     if(s->me_cache[index] == value)
2999         return 0;
3000     s->me_cache[index]= value;
3001
3002     block->mx= p0;
3003     block->my= p1;
3004     block->ref= ref;
3005     block->type &= ~BLOCK_INTRA;
3006     block[1]= block[b_stride]= block[b_stride+1]= *block;
3007
3008     rd= get_4block_rd(s, mb_x, mb_y, 0);
3009
3010 //FIXME chroma
3011     if(rd < *best_rd){
3012         *best_rd= rd;
3013         return 1;
3014     }else{
3015         block[0]= backup[0];
3016         block[1]= backup[1];
3017         block[b_stride]= backup[2];
3018         block[b_stride+1]= backup[3];
3019         return 0;
3020     }
3021 }
3022
3023 static void iterative_me(SnowContext *s){
3024     int pass, mb_x, mb_y;
3025     const int b_width = s->b_width  << s->block_max_depth;
3026     const int b_height= s->b_height << s->block_max_depth;
3027     const int b_stride= b_width;
3028     int color[3];
3029
3030     {
3031         RangeCoder r = s->c;
3032         uint8_t state[sizeof(s->block_state)];
3033         memcpy(state, s->block_state, sizeof(s->block_state));
3034         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3035             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3036                 encode_q_branch(s, 0, mb_x, mb_y);
3037         s->c = r;
3038         memcpy(s->block_state, state, sizeof(s->block_state));
3039     }
3040
3041     for(pass=0; pass<25; pass++){
3042         int change= 0;
3043
3044         for(mb_y= 0; mb_y<b_height; mb_y++){
3045             for(mb_x= 0; mb_x<b_width; mb_x++){
3046                 int dia_change, i, j, ref;
3047                 int best_rd= INT_MAX, ref_rd;
3048                 BlockNode backup, ref_b;
3049                 const int index= mb_x + mb_y * b_stride;
3050                 BlockNode *block= &s->block[index];
3051                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3052                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3053                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3054                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3055                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3056                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3057                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3058                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3059                 const int b_w= (MB_SIZE >> s->block_max_depth);
3060                 uint8_t obmc_edged[b_w*2][b_w*2];
3061
3062                 if(pass && (block->type & BLOCK_OPT))
3063                     continue;
3064                 block->type |= BLOCK_OPT;
3065
3066                 backup= *block;
3067
3068                 if(!s->me_cache_generation)
3069                     memset(s->me_cache, 0, sizeof(s->me_cache));
3070                 s->me_cache_generation += 1<<22;
3071
3072                 //FIXME precalculate
3073                 {
3074                     int x, y;
3075                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3076                     if(mb_x==0)
3077                         for(y=0; y<b_w*2; y++)
3078                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3079                     if(mb_x==b_stride-1)
3080                         for(y=0; y<b_w*2; y++)
3081                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3082                     if(mb_y==0){
3083                         for(x=0; x<b_w*2; x++)
3084                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3085                         for(y=1; y<b_w; y++)
3086                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3087                     }
3088                     if(mb_y==b_height-1){
3089                         for(x=0; x<b_w*2; x++)
3090                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3091                         for(y=b_w; y<b_w*2-1; y++)
3092                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3093                     }
3094                 }
3095
3096                 //skip stuff outside the picture
3097                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1){
3098                     uint8_t *src= s->  input_picture.data[0];
3099                     uint8_t *dst= s->current_picture.data[0];
3100                     const int stride= s->current_picture.linesize[0];
3101                     const int block_w= MB_SIZE >> s->block_max_depth;
3102                     const int sx= block_w*mb_x - block_w/2;
3103                     const int sy= block_w*mb_y - block_w/2;
3104                     const int w= s->plane[0].width;
3105                     const int h= s->plane[0].height;
3106                     int y;
3107
3108                     for(y=sy; y<0; y++)
3109                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3110                     for(y=h; y<sy+block_w*2; y++)
3111                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3112                     if(sx<0){
3113                         for(y=sy; y<sy+block_w*2; y++)
3114                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3115                     }
3116                     if(sx+block_w*2 > w){
3117                         for(y=sy; y<sy+block_w*2; y++)
3118                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3119                     }
3120                 }
3121
3122                 // intra(black) = neighbors' contribution to the current block
3123                 for(i=0; i<3; i++)
3124                     color[i]= get_dc(s, mb_x, mb_y, i);
3125
3126                 // get previous score (cannot be cached due to OBMC)
3127                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3128                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3129                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3130                 }else
3131                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3132
3133                 ref_b= *block;
3134                 ref_rd= best_rd;
3135                 for(ref=0; ref < s->ref_frames; ref++){
3136                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3137                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3138                         continue;
3139                     block->ref= ref;
3140                     best_rd= INT_MAX;
3141
3142                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3143                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3144                     if(tb)
3145                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3146                     if(lb)
3147                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3148                     if(rb)
3149                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3150                     if(bb)
3151                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3152
3153                     /* fullpel ME */
3154                     //FIXME avoid subpel interpolation / round to nearest integer
3155                     do{
3156                         dia_change=0;
3157                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3158                             for(j=0; j<i; j++){
3159                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3160                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3161                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3162                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3163                             }
3164                         }
3165                     }while(dia_change);
3166                     /* subpel ME */
3167                     do{
3168                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3169                         dia_change=0;
3170                         for(i=0; i<8; i++)
3171                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3172                     }while(dia_change);
3173                     //FIXME or try the standard 2 pass qpel or similar
3174
3175                     mvr[0][0]= block->mx;
3176                     mvr[0][1]= block->my;
3177                     if(ref_rd > best_rd){
3178                         ref_rd= best_rd;
3179                         ref_b= *block;
3180                     }
3181                 }
3182                 best_rd= ref_rd;
3183                 *block= ref_b;
3184 #if 1
3185                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3186                 //FIXME RD style color selection
3187 #endif
3188                 if(!same_block(block, &backup)){
3189                     if(tb ) tb ->type &= ~BLOCK_OPT;
3190                     if(lb ) lb ->type &= ~BLOCK_OPT;
3191                     if(rb ) rb ->type &= ~BLOCK_OPT;
3192                     if(bb ) bb ->type &= ~BLOCK_OPT;
3193                     if(tlb) tlb->type &= ~BLOCK_OPT;
3194                     if(trb) trb->type &= ~BLOCK_OPT;
3195                     if(blb) blb->type &= ~BLOCK_OPT;
3196                     if(brb) brb->type &= ~BLOCK_OPT;
3197                     change ++;
3198                 }
3199             }
3200         }
3201         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3202         if(!change)
3203             break;
3204     }
3205
3206     if(s->block_max_depth == 1){
3207         int change= 0;
3208         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3209             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3210                 int i;
3211                 int best_rd, init_rd;
3212                 const int index= mb_x + mb_y * b_stride;
3213                 BlockNode *b[4];
3214
3215                 b[0]= &s->block[index];
3216                 b[1]= b[0]+1;
3217                 b[2]= b[0]+b_stride;
3218                 b[3]= b[2]+1;
3219                 if(same_block(b[0], b[1]) &&
3220                    same_block(b[0], b[2]) &&
3221                    same_block(b[0], b[3]))
3222                     continue;
3223
3224                 if(!s->me_cache_generation)
3225                     memset(s->me_cache, 0, sizeof(s->me_cache));
3226                 s->me_cache_generation += 1<<22;
3227
3228                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3229
3230                 //FIXME more multiref search?
3231                 check_4block_inter(s, mb_x, mb_y,
3232                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3233                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3234
3235                 for(i=0; i<4; i++)
3236                     if(!(b[i]->type&BLOCK_INTRA))
3237                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3238
3239                 if(init_rd != best_rd)
3240                     change++;
3241             }
3242         }
3243         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3244     }
3245 }
3246
3247 static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
3248     const int w= b->width;
3249     const int h= b->height;
3250     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3251     const int qmul= qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
3252     int x,y, thres1, thres2;
3253
3254     if(s->qlog == LOSSLESS_QLOG){
3255         for(y=0; y<h; y++)
3256             for(x=0; x<w; x++)
3257                 dst[x + y*stride]= src[x + y*stride];
3258         return;
3259     }
3260
3261     bias= bias ? 0 : (3*qmul)>>3;
3262     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3263     thres2= 2*thres1;
3264
3265     if(!bias){
3266         for(y=0; y<h; y++){
3267             for(x=0; x<w; x++){
3268                 int i= src[x + y*stride];
3269
3270                 if((unsigned)(i+thres1) > thres2){
3271                     if(i>=0){
3272                         i<<= QEXPSHIFT;
3273                         i/= qmul; //FIXME optimize
3274                         dst[x + y*stride]=  i;
3275                     }else{
3276                         i= -i;
3277                         i<<= QEXPSHIFT;
3278                         i/= qmul; //FIXME optimize
3279                         dst[x + y*stride]= -i;
3280                     }
3281                 }else
3282                     dst[x + y*stride]= 0;
3283             }
3284         }
3285     }else{
3286         for(y=0; y<h; y++){
3287             for(x=0; x<w; x++){
3288                 int i= src[x + y*stride];
3289
3290                 if((unsigned)(i+thres1) > thres2){
3291                     if(i>=0){
3292                         i<<= QEXPSHIFT;
3293                         i= (i + bias) / qmul; //FIXME optimize
3294                         dst[x + y*stride]=  i;
3295                     }else{
3296                         i= -i;
3297                         i<<= QEXPSHIFT;
3298                         i= (i + bias) / qmul; //FIXME optimize
3299                         dst[x + y*stride]= -i;
3300                     }
3301                 }else
3302                     dst[x + y*stride]= 0;
3303             }
3304         }
3305     }
3306 }
3307
3308 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
3309     const int w= b->width;
3310     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3311     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3312     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3313     int x,y;
3314
3315     if(s->qlog == LOSSLESS_QLOG) return;
3316
3317     for(y=start_y; y<end_y; y++){
3318 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3319         IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3320         for(x=0; x<w; x++){
3321             int i= line[x];
3322             if(i<0){
3323                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3324             }else if(i>0){
3325                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3326             }
3327         }
3328     }
3329 }
3330
3331 static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
3332     const int w= b->width;
3333     const int h= b->height;
3334     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3335     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3336     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3337     int x,y;
3338
3339     if(s->qlog == LOSSLESS_QLOG) return;
3340
3341     for(y=0; y<h; y++){
3342         for(x=0; x<w; x++){
3343             int i= src[x + y*stride];
3344             if(i<0){
3345                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3346             }else if(i>0){
3347                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3348             }
3349         }
3350     }
3351 }
3352
3353 static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3354     const int w= b->width;
3355     const int h= b->height;
3356     int x,y;
3357
3358     for(y=h-1; y>=0; y--){
3359         for(x=w-1; x>=0; x--){
3360             int i= x + y*stride;
3361
3362             if(x){
3363                 if(use_median){
3364                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3365                     else  src[i] -= src[i - 1];
3366                 }else{
3367                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3368                     else  src[i] -= src[i - 1];
3369                 }
3370             }else{
3371                 if(y) src[i] -= src[i - stride];
3372             }
3373         }
3374     }
3375 }
3376
3377 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3378     const int w= b->width;
3379     int x,y;
3380
3381     IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3382     IDWTELEM * prev;
3383
3384     if (start_y != 0)
3385         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3386
3387     for(y=start_y; y<end_y; y++){
3388         prev = line;
3389 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3390         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3391         for(x=0; x<w; x++){
3392             if(x){
3393                 if(use_median){
3394                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3395                     else  line[x] += line[x - 1];
3396                 }else{
3397                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3398                     else  line[x] += line[x - 1];
3399                 }
3400             }else{
3401                 if(y) line[x] += prev[x];
3402             }
3403         }
3404     }
3405 }
3406
3407 static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3408     const int w= b->width;
3409     const int h= b->height;
3410     int x,y;
3411
3412     for(y=0; y<h; y++){
3413         for(x=0; x<w; x++){
3414             int i= x + y*stride;
3415
3416             if(x){
3417                 if(use_median){
3418                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3419                     else  src[i] += src[i - 1];
3420                 }else{
3421                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3422                     else  src[i] += src[i - 1];
3423                 }
3424             }else{
3425                 if(y) src[i] += src[i - stride];
3426             }
3427         }
3428     }
3429 }
3430
3431 static void encode_qlogs(SnowContext *s){
3432     int plane_index, level, orientation;
3433
3434     for(plane_index=0; plane_index<2; plane_index++){
3435         for(level=0; level<s->spatial_decomposition_count; level++){
3436             for(orientation=level ? 1:0; orientation<4; orientation++){
3437                 if(orientation==2) continue;
3438                 put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3439             }
3440         }
3441     }
3442 }
3443
3444 static void encode_header(SnowContext *s){
3445     int plane_index, i;
3446     uint8_t kstate[32];
3447
3448     memset(kstate, MID_STATE, sizeof(kstate));
3449
3450     put_rac(&s->c, kstate, s->keyframe);
3451     if(s->keyframe || s->always_reset){
3452         reset_contexts(s);
3453         s->last_spatial_decomposition_type=
3454         s->last_qlog=
3455         s->last_qbias=
3456         s->last_mv_scale=
3457         s->last_block_max_depth= 0;
3458         for(plane_index=0; plane_index<2; plane_index++){
3459             Plane *p= &s->plane[plane_index];
3460             p->last_htaps=0;
3461             p->last_diag_mc=0;
3462             memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
3463         }
3464     }
3465     if(s->keyframe){
3466         put_symbol(&s->c, s->header_state, s->version, 0);
3467         put_rac(&s->c, s->header_state, s->always_reset);
3468         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3469         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3470         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3471         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3472         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3473         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3474         put_rac(&s->c, s->header_state, s->spatial_scalability);
3475 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3476         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3477
3478         encode_qlogs(s);
3479     }
3480
3481     if(!s->keyframe){
3482         int update_mc=0;
3483         for(plane_index=0; plane_index<2; plane_index++){
3484             Plane *p= &s->plane[plane_index];
3485             update_mc |= p->last_htaps   != p->htaps;
3486             update_mc |= p->last_diag_mc != p->diag_mc;
3487             update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3488         }
3489         put_rac(&s->c, s->header_state, update_mc);
3490         if(update_mc){
3491             for(plane_index=0; plane_index<2; plane_index++){
3492                 Plane *p= &s->plane[plane_index];
3493                 put_rac(&s->c, s->header_state, p->diag_mc);
3494                 put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
3495                 for(i= p->htaps/2; i; i--)
3496                     put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
3497             }
3498         }
3499         if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
3500             put_rac(&s->c, s->header_state, 1);
3501             put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3502             encode_qlogs(s);
3503         }else
3504             put_rac(&s->c, s->header_state, 0);
3505     }
3506
3507     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3508     put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3509     put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3510     put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3511     put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3512
3513 }
3514
3515 static void update_last_header_values(SnowContext *s){
3516     int plane_index;
3517
3518     if(!s->keyframe){
3519         for(plane_index=0; plane_index<2; plane_index++){
3520             Plane *p= &s->plane[plane_index];
3521             p->last_diag_mc= p->diag_mc;
3522             p->last_htaps  = p->htaps;
3523             memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3524         }
3525     }
3526
3527     s->last_spatial_decomposition_type  = s->spatial_decomposition_type;
3528     s->last_qlog                        = s->qlog;
3529     s->last_qbias                       = s->qbias;
3530     s->last_mv_scale                    = s->mv_scale;
3531     s->last_block_max_depth             = s->block_max_depth;
3532     s->last_spatial_decomposition_count = s->spatial_decomposition_count;
3533 }
3534
3535 static void decode_qlogs(SnowContext *s){
3536     int plane_index, level, orientation;
3537
3538     for(plane_index=0; plane_index<3; plane_index++){
3539         for(level=0; level<s->spatial_decomposition_count; level++){
3540             for(orientation=level ? 1:0; orientation<4; orientation++){
3541                 int q;
3542                 if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3543                 else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3544                 else                    q= get_symbol(&s->c, s->header_state, 1);
3545                 s->plane[plane_index].band[level][orientation].qlog= q;
3546             }
3547         }
3548     }
3549 }
3550
3551 #define GET_S(dst, check) \
3552     tmp= get_symbol(&s->c, s->header_state, 0);\
3553     if(!(check)){\
3554         av_log(s->avctx, AV_LOG_ERROR, "Error " #dst " is %d\n", tmp);\
3555         return -1;\
3556     }\
3557     dst= tmp;
3558
3559 static int decode_header(SnowContext *s){
3560     int plane_index, tmp;
3561     uint8_t kstate[32];
3562
3563     memset(kstate, MID_STATE, sizeof(kstate));
3564
3565     s->keyframe= get_rac(&s->c, kstate);
3566     if(s->keyframe || s->always_reset){
3567         reset_contexts(s);
3568         s->spatial_decomposition_type=
3569         s->qlog=
3570         s->qbias=
3571         s->mv_scale=
3572         s->block_max_depth= 0;
3573     }
3574     if(s->keyframe){
3575         GET_S(s->version, tmp <= 0U)
3576         s->always_reset= get_rac(&s->c, s->header_state);
3577         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3578         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3579         GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
3580         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3581         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3582         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3583         s->spatial_scalability= get_rac(&s->c, s->header_state);
3584 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3585         GET_S(s->max_ref_frames, tmp < (unsigned)MAX_REF_FRAMES)
3586         s->max_ref_frames++;
3587
3588         decode_qlogs(s);
3589     }
3590
3591     if(!s->keyframe){
3592         if(get_rac(&s->c, s->header_state)){
3593             for(plane_index=0; plane_index<2; plane_index++){
3594                 int htaps, i, sum=0;
3595                 Plane *p= &s->plane[plane_index];
3596                 p->diag_mc= get_rac(&s->c, s->header_state);
3597                 htaps= get_symbol(&s->c, s->header_state, 0)*2 + 2;
3598                 if((unsigned)htaps > HTAPS_MAX || htaps==0)
3599                     return -1;
3600                 p->htaps= htaps;
3601                 for(i= htaps/2; i; i--){
3602                     p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
3603                     sum += p->hcoeff[i];
3604                 }
3605                 p->hcoeff[0]= 32-sum;
3606             }
3607             s->plane[2].diag_mc= s->plane[1].diag_mc;
3608             s->plane[2].htaps  = s->plane[1].htaps;
3609             memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
3610         }
3611         if(get_rac(&s->c, s->header_state)){
3612             GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
3613             decode_qlogs(s);
3614         }
3615     }
3616
3617     s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3618     if(s->spatial_decomposition_type > 1U){
3619         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3620         return -1;
3621     }
3622     if(FFMIN(s->avctx-> width>>s->chroma_h_shift,
3623              s->avctx->height>>s->chroma_v_shift) >> (s->spatial_decomposition_count-1) <= 0){
3624         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large for size", s->spatial_decomposition_count);
3625         return -1;
3626     }
3627
3628     s->qlog           += get_symbol(&s->c, s->header_state, 1);
3629     s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3630     s->qbias          += get_symbol(&s->c, s->header_state, 1);
3631     s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3632     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3633         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3634         s->block_max_depth= 0;
3635         return -1;
3636     }
3637
3638     return 0;
3639 }
3640
3641 static void init_qexp(void){
3642     int i;
3643     double v=128;
3644
3645     for(i=0; i<QROOT; i++){
3646         qexp[i]= lrintf(v);
3647         v *= pow(2, 1.0 / QROOT);
3648     }
3649 }
3650
3651 static av_cold int common_init(AVCodecContext *avctx){
3652     SnowContext *s = avctx->priv_data;
3653     int width, height;
3654     int i, j;
3655
3656     s->avctx= avctx;
3657     s->max_ref_frames=1; //just make sure its not an invalid value in case of no initial keyframe
3658
3659     dsputil_init(&s->dsp, avctx);
3660
3661 #define mcf(dx,dy)\
3662     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3663     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3664         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3665     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3666     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3667         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3668
3669     mcf( 0, 0)
3670     mcf( 4, 0)
3671     mcf( 8, 0)
3672     mcf(12, 0)
3673     mcf( 0, 4)
3674     mcf( 4, 4)
3675     mcf( 8, 4)
3676     mcf(12, 4)
3677     mcf( 0, 8)
3678     mcf( 4, 8)
3679     mcf( 8, 8)
3680     mcf(12, 8)
3681     mcf( 0,12)
3682     mcf( 4,12)
3683     mcf( 8,12)
3684     mcf(12,12)
3685
3686 #define mcfh(dx,dy)\
3687     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3688     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3689         mc_block_hpel ## dx ## dy ## 16;\
3690     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3691     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3692         mc_block_hpel ## dx ## dy ## 8;
3693
3694     mcfh(0, 0)
3695     mcfh(8, 0)
3696     mcfh(0, 8)
3697     mcfh(8, 8)
3698
3699     if(!qexp[0])
3700         init_qexp();
3701
3702 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3703
3704     width= s->avctx->width;
3705     height= s->avctx->height;
3706
3707     s->spatial_idwt_buffer= av_mallocz(width*height*sizeof(IDWTELEM));
3708     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM)); //FIXME this does not belong here
3709
3710     for(i=0; i<MAX_REF_FRAMES; i++)
3711         for(j=0; j<MAX_REF_FRAMES; j++)
3712             scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3713
3714     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3715     s->scratchbuf = av_malloc(s->mconly_picture.linesize[0]*7*MB_SIZE);
3716
3717     return 0;
3718 }
3719
3720 static int common_init_after_header(AVCodecContext *avctx){
3721     SnowContext *s = avctx->priv_data;
3722     int plane_index, level, orientation;
3723
3724     for(plane_index=0; plane_index<3; plane_index++){
3725         int w= s->avctx->width;
3726         int h= s->avctx->height;
3727
3728         if(plane_index){
3729             w>>= s->chroma_h_shift;
3730             h>>= s->chroma_v_shift;
3731         }
3732         s->plane[plane_index].width = w;
3733         s->plane[plane_index].height= h;
3734
3735         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3736             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3737                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3738
3739                 b->buf= s->spatial_dwt_buffer;
3740                 b->level= level;
3741                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3742                 b->width = (w + !(orientation&1))>>1;
3743                 b->height= (h + !(orientation>1))>>1;
3744
3745                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3746                 b->buf_x_offset = 0;
3747                 b->buf_y_offset = 0;
3748
3749                 if(orientation&1){
3750                     b->buf += (w+1)>>1;
3751                     b->buf_x_offset = (w+1)>>1;
3752                 }
3753                 if(orientation>1){
3754                     b->buf += b->stride>>1;
3755                     b->buf_y_offset = b->stride_line >> 1;
3756                 }
3757                 b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
3758
3759                 if(level)
3760                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3761                 //FIXME avoid this realloc
3762                 av_freep(&b->x_coeff);
3763                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3764             }
3765             w= (w+1)>>1;
3766             h= (h+1)>>1;
3767         }
3768     }
3769
3770     return 0;
3771 }
3772
3773 static int qscale2qlog(int qscale){
3774     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3775            + 61*QROOT/8; //<64 >60
3776 }
3777
3778 static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3779 {
3780     /* Estimate the frame's complexity as a sum of weighted dwt coefficients.
3781      * FIXME we know exact mv bits at this point,
3782      * but ratecontrol isn't set up to include them. */
3783     uint32_t coef_sum= 0;
3784     int level, orientation, delta_qlog;
3785
3786     for(level=0; level<s->spatial_decomposition_count; level++){
3787         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3788             SubBand *b= &s->plane[0].band[level][orientation];
3789             IDWTELEM *buf= b->ibuf;
3790             const int w= b->width;
3791             const int h= b->height;
3792             const int stride= b->stride;
3793             const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3794             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3795             const int qdiv= (1<<16)/qmul;
3796             int x, y;
3797             //FIXME this is ugly
3798             for(y=0; y<h; y++)
3799                 for(x=0; x<w; x++)
3800                     buf[x+y*stride]= b->buf[x+y*stride];
3801             if(orientation==0)
3802                 decorrelate(s, b, buf, stride, 1, 0);
3803             for(y=0; y<h; y++)
3804                 for(x=0; x<w; x++)
3805                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3806         }
3807     }
3808
3809     /* ugly, ratecontrol just takes a sqrt again */
3810     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3811     assert(coef_sum < INT_MAX);
3812
3813     if(pict->pict_type == FF_I_TYPE){
3814         s->m.current_picture.mb_var_sum= coef_sum;
3815         s->m.current_picture.mc_mb_var_sum= 0;
3816     }else{
3817         s->m.current_picture.mc_mb_var_sum= coef_sum;
3818         s->m.current_picture.mb_var_sum= 0;
3819     }
3820
3821     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3822     if (pict->quality < 0)
3823         return INT_MIN;
3824     s->lambda= pict->quality * 3/2;
3825     delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3826     s->qlog+= delta_qlog;
3827     return delta_qlog;
3828 }
3829
3830 static void calculate_visual_weight(SnowContext *s, Plane *p){
3831     int width = p->width;
3832     int height= p->height;
3833     int level, orientation, x, y;
3834
3835     for(level=0; level<s->spatial_decomposition_count; level++){
3836         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3837             SubBand *b= &p->band[level][orientation];
3838             IDWTELEM *ibuf= b->ibuf;
3839             int64_t error=0;
3840
3841             memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
3842             ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
3843             ff_spatial_idwt(s->spatial_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3844             for(y=0; y<height; y++){
3845                 for(x=0; x<width; x++){
3846                     int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
3847                     error += d*d;
3848                 }
3849             }
3850
3851             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3852         }
3853     }
3854 }
3855
3856 #define QUANTIZE2 0
3857
3858 #if QUANTIZE2==1
3859 #define Q2_STEP 8
3860
3861 static void find_sse(SnowContext *s, Plane *p, int *score, int score_stride, IDWTELEM *r0, IDWTELEM *r1, int level, int orientation){
3862     SubBand *b= &p->band[level][orientation];
3863     int x, y;
3864     int xo=0;
3865     int yo=0;
3866     int step= 1 << (s->spatial_decomposition_count - level);
3867
3868     if(orientation&1)
3869         xo= step>>1;
3870     if(orientation&2)
3871         yo= step>>1;
3872
3873     //FIXME bias for nonzero ?
3874     //FIXME optimize
3875     memset(score, 0, sizeof(*score)*score_stride*((p->height + Q2_STEP-1)/Q2_STEP));
3876     for(y=0; y<p->height; y++){
3877         for(x=0; x<p->width; x++){
3878             int sx= (x-xo + step/2) / step / Q2_STEP;
3879             int sy= (y-yo + step/2) / step / Q2_STEP;
3880             int v= r0[x + y*p->width] - r1[x + y*p->width];
3881             assert(sx>=0 && sy>=0 && sx < score_stride);
3882             v= ((v+8)>>4)<<4;
3883             score[sx + sy*score_stride] += v*v;
3884             assert(score[sx + sy*score_stride] >= 0);
3885         }
3886     }
3887 }
3888
3889 static void dequantize_all(SnowContext *s, Plane *p, IDWTELEM *buffer, int width, int height){
3890     int level, orientation;
3891
3892     for(level=0; level<s->spatial_decomposition_count; level++){
3893         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3894             SubBand *b= &p->band[level][orientation];
3895             IDWTELEM *dst= buffer + (b->ibuf - s->spatial_idwt_buffer);
3896
3897             dequantize(s, b, dst, b->stride);
3898         }
3899     }
3900 }
3901
3902 static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, int height, int stride, int type){
3903     int level, orientation, ys, xs, x, y, pass;
3904     IDWTELEM best_dequant[height * stride];
3905     IDWTELEM idwt2_buffer[height * stride];
3906     const int score_stride= (width + 10)/Q2_STEP;
3907     int best_score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
3908     int score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
3909     int threshold= (s->m.lambda * s->m.lambda) >> 6;
3910
3911     //FIXME pass the copy cleanly ?
3912
3913 //    memcpy(dwt_buffer, buffer, height * stride * sizeof(DWTELEM));
3914     ff_spatial_dwt(buffer, width, height, stride, type, s->spatial_decomposition_count);
3915
3916     for(level=0; level<s->spatial_decomposition_count; level++){
3917         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3918             SubBand *b= &p->band[level][orientation];
3919             IDWTELEM *dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
3920              DWTELEM *src=       buffer + (b-> buf - s->spatial_dwt_buffer);
3921             assert(src == b->buf); // code does not depend on this but it is true currently
3922
3923             quantize(s, b, dst, src, b->stride, s->qbias);
3924         }
3925     }
3926     for(pass=0; pass<1; pass++){
3927         if(s->qbias == 0) //keyframe
3928             continue;
3929         for(level=0; level<s->spatial_decomposition_count; level++){
3930             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3931                 SubBand *b= &p->band[level][orientation];
3932                 IDWTELEM *dst= idwt2_buffer + (b->ibuf - s->spatial_idwt_buffer);
3933                 IDWTELEM *best_dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
3934
3935                 for(ys= 0; ys<Q2_STEP; ys++){
3936                     for(xs= 0; xs<Q2_STEP; xs++){
3937                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
3938                         dequantize_all(s, p, idwt2_buffer, width, height);
3939                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
3940                         find_sse(s, p, best_score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
3941                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
3942                         for(y=ys; y<b->height; y+= Q2_STEP){
3943                             for(x=xs; x<b->width; x+= Q2_STEP){
3944                                 if(dst[x + y*b->stride]<0) dst[x + y*b->stride]++;
3945                                 if(dst[x + y*b->stride]>0) dst[x + y*b->stride]--;
3946                                 //FIXME try more than just --
3947                             }
3948                         }
3949                         dequantize_all(s, p, idwt2_buffer, width, height);
3950                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
3951                         find_sse(s, p, score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
3952                         for(y=ys; y<b->height; y+= Q2_STEP){
3953                             for(x=xs; x<b->width; x+= Q2_STEP){
3954                                 int score_idx= x/Q2_STEP + (y/Q2_STEP)*score_stride;
3955                                 if(score[score_idx] <= best_score[score_idx] + threshold){
3956                                     best_score[score_idx]= score[score_idx];
3957                                     if(best_dst[x + y*b->stride]<0) best_dst[x + y*b->stride]++;
3958                                     if(best_dst[x + y*b->stride]>0) best_dst[x + y*b->stride]--;
3959                                     //FIXME copy instead
3960                                 }
3961                             }
3962                         }
3963                     }
3964                 }
3965             }
3966         }
3967     }
3968     memcpy(s->spatial_idwt_buffer, best_dequant, height * stride * sizeof(IDWTELEM)); //FIXME work with that directly instead of copy at the end
3969 }
3970
3971 #endif /* QUANTIZE2==1 */
3972
3973 static av_cold int encode_init(AVCodecContext *avctx)
3974 {
3975     SnowContext *s = avctx->priv_data;
3976     int plane_index;
3977
3978     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3979         av_log(avctx, AV_LOG_ERROR, "This codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3980                "Use vstrict=-2 / -strict -2 to use it anyway.\n");
3981         return -1;
3982     }
3983
3984     if(avctx->prediction_method == DWT_97
3985        && (avctx->flags & CODEC_FLAG_QSCALE)
3986        && avctx->global_quality == 0){
3987         av_log(avctx, AV_LOG_ERROR, "The 9/7 wavelet is incompatible with lossless mode.\n");
3988         return -1;
3989     }
3990
3991     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3992
3993     s->mv_scale       = (avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3994     s->block_max_depth= (avctx->flags & CODEC_FLAG_4MV ) ? 1 : 0;
3995
3996     for(plane_index=0; plane_index<3; plane_index++){
3997         s->plane[plane_index].diag_mc= 1;
3998         s->plane[plane_index].htaps= 6;
3999         s->plane[plane_index].hcoeff[0]=  40;
4000         s->plane[plane_index].hcoeff[1]= -10;
4001         s->plane[plane_index].hcoeff[2]=   2;
4002         s->plane[plane_index].fast_mc= 1;
4003     }
4004
4005     common_init(avctx);
4006     alloc_blocks(s);
4007
4008     s->version=0;
4009
4010     s->m.avctx   = avctx;
4011     s->m.flags   = avctx->flags;
4012     s->m.bit_rate= avctx->bit_rate;
4013
4014     s->m.me.temp      =
4015     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
4016     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4017     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4018     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
4019     h263_encode_init(&s->m); //mv_penalty
4020
4021     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
4022
4023     if(avctx->flags&CODEC_FLAG_PASS1){
4024         if(!avctx->stats_out)
4025             avctx->stats_out = av_mallocz(256);
4026     }
4027     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
4028         if(ff_rate_control_init(&s->m) < 0)
4029             return -1;
4030     }
4031     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
4032
4033     avctx->coded_frame= &s->current_picture;
4034     switch(avctx->pix_fmt){
4035 //    case PIX_FMT_YUV444P:
4036 //    case PIX_FMT_YUV422P:
4037     case PIX_FMT_YUV420P:
4038     case PIX_FMT_GRAY8:
4039 //    case PIX_FMT_YUV411P:
4040 //    case PIX_FMT_YUV410P:
4041         s->colorspace_type= 0;
4042         break;
4043 /*    case PIX_FMT_RGB32:
4044         s->colorspace= 1;
4045         break;*/
4046     default:
4047         av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
4048         return -1;
4049     }
4050 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4051     s->chroma_h_shift= 1;
4052     s->chroma_v_shift= 1;
4053
4054     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4055     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4056
4057     s->avctx->get_buffer(s->avctx, &s->input_picture);
4058
4059     if(s->avctx->me_method == ME_ITER){
4060         int i;
4061         int size= s->b_width * s->b_height << 2*s->block_max_depth;
4062         for(i=0; i<s->max_ref_frames; i++){
4063             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4064             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4065         }
4066     }
4067
4068     return 0;
4069 }
4070
4071 #define USE_HALFPEL_PLANE 0
4072
4073 static void halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
4074     int p,x,y;
4075
4076     assert(!(s->avctx->flags & CODEC_FLAG_EMU_EDGE));
4077
4078     for(p=0; p<3; p++){
4079         int is_chroma= !!p;
4080         int w= s->avctx->width  >>is_chroma;
4081         int h= s->avctx->height >>is_chroma;
4082         int ls= frame->linesize[p];
4083         uint8_t *src= frame->data[p];
4084
4085         halfpel[1][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4086         halfpel[2][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4087         halfpel[3][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4088
4089         halfpel[0][p]= src;
4090         for(y=0; y<h; y++){
4091             for(x=0; x<w; x++){
4092                 int i= y*ls + x;
4093
4094                 halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
4095             }
4096         }
4097         for(y=0; y<h; y++){
4098             for(x=0; x<w; x++){
4099                 int i= y*ls + x;
4100
4101                 halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
4102             }
4103         }
4104         src= halfpel[1][p];
4105         for(y=0; y<h; y++){
4106             for(x=0; x<w; x++){
4107                 int i= y*ls + x;
4108
4109                 halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
4110             }
4111         }
4112
4113 //FIXME border!
4114     }
4115 }
4116
4117 static void release_buffer(AVCodecContext *avctx){
4118     SnowContext *s = avctx->priv_data;
4119     int i;
4120
4121     if(s->last_picture[s->max_ref_frames-1].data[0]){
4122         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4123         for(i=0; i<9; i++)
4124             if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3])
4125                 av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture.linesize[i%3]));
4126     }
4127 }
4128
4129 static int frame_start(SnowContext *s){
4130    AVFrame tmp;
4131    int w= s->avctx->width; //FIXME round up to x16 ?
4132    int h= s->avctx->height;
4133
4134     if(s->current_picture.data[0]){
4135         s->dsp.draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4136         s->dsp.draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4137         s->dsp.draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4138     }
4139
4140     release_buffer(s->avctx);
4141
4142     tmp= s->last_picture[s->max_ref_frames-1];
4143     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4144     memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4);
4145     if(USE_HALFPEL_PLANE && s->current_picture.data[0])
4146         halfpel_interpol(s, s->halfpel_plane[0], &s->current_picture);
4147     s->last_picture[0]= s->current_picture;
4148     s->current_picture= tmp;
4149
4150     if(s->keyframe){
4151         s->ref_frames= 0;
4152     }else{
4153         int i;
4154         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4155             if(i && s->last_picture[i-1].key_frame)
4156                 break;
4157         s->ref_frames= i;
4158         if(s->ref_frames==0){
4159             av_log(s->avctx,AV_LOG_ERROR, "No reference frames\n");
4160             return -1;
4161         }
4162     }
4163
4164     s->current_picture.reference= 1;
4165     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4166         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4167         return -1;
4168     }
4169
4170     s->current_picture.key_frame= s->keyframe;
4171
4172     return 0;
4173 }
4174
4175 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4176     SnowContext *s = avctx->priv_data;
4177     RangeCoder * const c= &s->c;
4178     AVFrame *pict = data;
4179     const int width= s->avctx->width;
4180     const int height= s->avctx->height;
4181     int level, orientation, plane_index, i, y;
4182     uint8_t rc_header_bak[sizeof(s->header_state)];
4183     uint8_t rc_block_bak[sizeof(s->block_state)];
4184
4185     ff_init_range_encoder(c, buf, buf_size);
4186     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4187
4188     for(i=0; i<3; i++){
4189         int shift= !!i;
4190         for(y=0; y<(height>>shift); y++)
4191             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4192                    &pict->data[i][y * pict->linesize[i]],
4193                    width>>shift);
4194     }
4195     s->new_picture = *pict;
4196
4197     s->m.picture_number= avctx->frame_number;
4198     if(avctx->flags&CODEC_FLAG_PASS2){
4199         s->m.pict_type =
4200         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4201         s->keyframe= pict->pict_type==FF_I_TYPE;
4202         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
4203             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4204             if (pict->quality < 0)
4205                 return -1;
4206         }
4207     }else{
4208         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4209         s->m.pict_type=
4210         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4211     }
4212
4213     if(s->pass1_rc && avctx->frame_number == 0)
4214         pict->quality= 2*FF_QP2LAMBDA;
4215     if(pict->quality){
4216         s->qlog= qscale2qlog(pict->quality);
4217         s->lambda = pict->quality * 3/2;
4218     }
4219     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
4220         s->qlog= LOSSLESS_QLOG;
4221         s->lambda = 0;
4222     }//else keep previous frame's qlog until after motion estimation
4223
4224     frame_start(s);
4225
4226     s->m.current_picture_ptr= &s->m.current_picture;
4227     if(pict->pict_type == FF_P_TYPE){
4228         int block_width = (width +15)>>4;
4229         int block_height= (height+15)>>4;
4230         int stride= s->current_picture.linesize[0];
4231
4232         assert(s->current_picture.data[0]);
4233         assert(s->last_picture[0].data[0]);
4234
4235         s->m.avctx= s->avctx;
4236         s->m.current_picture.data[0]= s->current_picture.data[0];
4237         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
4238         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4239         s->m.   last_picture_ptr= &s->m.   last_picture;
4240         s->m.linesize=
4241         s->m.   last_picture.linesize[0]=
4242         s->m.    new_picture.linesize[0]=
4243         s->m.current_picture.linesize[0]= stride;
4244         s->m.uvlinesize= s->current_picture.linesize[1];
4245         s->m.width = width;
4246         s->m.height= height;
4247         s->m.mb_width = block_width;
4248         s->m.mb_height= block_height;
4249         s->m.mb_stride=   s->m.mb_width+1;
4250         s->m.b8_stride= 2*s->m.mb_width+1;
4251         s->m.f_code=1;
4252         s->m.pict_type= pict->pict_type;
4253         s->m.me_method= s->avctx->me_method;
4254         s->m.me.scene_change_score=0;
4255         s->m.flags= s->avctx->flags;
4256         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4257         s->m.out_format= FMT_H263;
4258         s->m.unrestricted_mv= 1;
4259
4260         s->m.lambda = s->lambda;
4261         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4262         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4263
4264         s->m.dsp= s->dsp; //move
4265         ff_init_me(&s->m);
4266         s->dsp= s->m.dsp;
4267     }
4268
4269     if(s->pass1_rc){
4270         memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
4271         memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
4272     }
4273
4274 redo_frame:
4275
4276     if(pict->pict_type == FF_I_TYPE)
4277         s->spatial_decomposition_count= 5;
4278     else
4279         s->spatial_decomposition_count= 5;
4280
4281     s->m.pict_type = pict->pict_type;
4282     s->qbias= pict->pict_type == FF_P_TYPE ? 2 : 0;
4283
4284     common_init_after_header(avctx);
4285
4286     if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
4287         for(plane_index=0; plane_index<3; plane_index++){
4288             calculate_visual_weight(s, &s->plane[plane_index]);
4289         }
4290     }
4291
4292     encode_header(s);
4293     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4294     encode_blocks(s, 1);
4295     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4296
4297     for(plane_index=0; plane_index<3; plane_index++){
4298         Plane *p= &s->plane[plane_index];
4299         int w= p->width;
4300         int h= p->height;
4301         int x, y;
4302 //        int bits= put_bits_count(&s->c.pb);
4303
4304         if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
4305             //FIXME optimize
4306             if(pict->data[plane_index]) //FIXME gray hack
4307                 for(y=0; y<h; y++){
4308                     for(x=0; x<w; x++){
4309                         s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4310                     }
4311                 }
4312             predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
4313
4314             if(   plane_index==0
4315                && pict->pict_type == FF_P_TYPE
4316                && !(avctx->flags&CODEC_FLAG_PASS2)
4317                && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4318                 ff_init_range_encoder(c, buf, buf_size);
4319                 ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4320                 pict->pict_type= FF_I_TYPE;
4321                 s->keyframe=1;
4322                 s->current_picture.key_frame=1;
4323                 goto redo_frame;
4324             }
4325
4326             if(s->qlog == LOSSLESS_QLOG){
4327                 for(y=0; y<h; y++){
4328                     for(x=0; x<w; x++){
4329                         s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4330                     }
4331                 }
4332             }else{
4333                 for(y=0; y<h; y++){
4334                     for(x=0; x<w; x++){
4335                         s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
4336                     }
4337                 }
4338             }
4339
4340             /*  if(QUANTIZE2)
4341                 dwt_quantize(s, p, s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type);
4342             else*/
4343                 ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4344
4345             if(s->pass1_rc && plane_index==0){
4346                 int delta_qlog = ratecontrol_1pass(s, pict);
4347                 if (delta_qlog <= INT_MIN)
4348                     return -1;
4349                 if(delta_qlog){
4350                     //reordering qlog in the bitstream would eliminate this reset
4351                     ff_init_range_encoder(c, buf, buf_size);
4352                     memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
4353                     memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
4354                     encode_header(s);
4355                     encode_blocks(s, 0);
4356                 }
4357             }
4358
4359             for(level=0; level<s->spatial_decomposition_count; level++){
4360                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
4361                     SubBand *b= &p->band[level][orientation];
4362
4363                     if(!QUANTIZE2)
4364                         quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
4365                     if(orientation==0)
4366                         decorrelate(s, b, b->ibuf, b->stride, pict->pict_type == FF_P_TYPE, 0);
4367                     encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
4368                     assert(b->parent==NULL || b->parent->stride == b->stride*2);
4369                     if(orientation==0)
4370                         correlate(s, b, b->ibuf, b->stride, 1, 0);
4371                 }
4372             }
4373
4374             for(level=0; level<s->spatial_decomposition_count; level++){
4375                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
4376                     SubBand *b= &p->band[level][orientation];
4377
4378                     dequantize(s, b, b->ibuf, b->stride);
4379                 }
4380             }
4381
4382             ff_spatial_idwt(s->spatial_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4383             if(s->qlog == LOSSLESS_QLOG){
4384                 for(y=0; y<h; y++){
4385                     for(x=0; x<w; x++){
4386                         s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
4387                     }
4388                 }
4389             }
4390             predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4391         }else{
4392             //ME/MC only
4393             if(pict->pict_type == FF_I_TYPE){
4394                 for(y=0; y<h; y++){
4395                     for(x=0; x<w; x++){
4396                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
4397                             pict->data[plane_index][y*pict->linesize[plane_index] + x];
4398                     }
4399                 }
4400             }else{
4401                 memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
4402                 predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4403             }
4404         }
4405         if(s->avctx->flags&CODEC_FLAG_PSNR){
4406             int64_t error= 0;
4407
4408             if(pict->data[plane_index]) //FIXME gray hack
4409                 for(y=0; y<h; y++){
4410                     for(x=0; x<w; x++){
4411                         int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4412                         error += d*d;
4413                     }
4414                 }
4415             s->avctx->error[plane_index] += error;
4416             s->current_picture.error[plane_index] = error;
4417         }
4418
4419     }
4420
4421     update_last_header_values(s);
4422
4423     release_buffer(avctx);
4424
4425     s->current_picture.coded_picture_number = avctx->frame_number;
4426     s->current_picture.pict_type = pict->pict_type;
4427     s->current_picture.quality = pict->quality;
4428     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4429     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4430     s->m.current_picture.display_picture_number =
4431     s->m.current_picture.coded_picture_number = avctx->frame_number;
4432     s->m.current_picture.quality = pict->quality;
4433     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4434     if(s->pass1_rc)
4435         if (ff_rate_estimate_qscale(&s->m, 0) < 0)
4436             return -1;
4437     if(avctx->flags&CODEC_FLAG_PASS1)
4438         ff_write_pass1_stats(&s->m);
4439     s->m.last_pict_type = s->m.pict_type;
4440     avctx->frame_bits = s->m.frame_bits;
4441     avctx->mv_bits = s->m.mv_bits;
4442     avctx->misc_bits = s->m.misc_bits;
4443     avctx->p_tex_bits = s->m.p_tex_bits;
4444
4445     emms_c();
4446
4447     return ff_rac_terminate(c);
4448 }
4449
4450 static av_cold void common_end(SnowContext *s){
4451     int plane_index, level, orientation, i;
4452
4453     av_freep(&s->spatial_dwt_buffer);
4454     av_freep(&s->spatial_idwt_buffer);
4455
4456     s->m.me.temp= NULL;
4457     av_freep(&s->m.me.scratchpad);
4458     av_freep(&s->m.me.map);
4459     av_freep(&s->m.me.score_map);
4460     av_freep(&s->m.obmc_scratchpad);
4461
4462     av_freep(&s->block);
4463     av_freep(&s->scratchbuf);
4464
4465     for(i=0; i<MAX_REF_FRAMES; i++){
4466         av_freep(&s->ref_mvs[i]);
4467         av_freep(&s->ref_scores[i]);
4468         if(s->last_picture[i].data[0])
4469             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4470     }
4471
4472     for(plane_index=0; plane_index<3; plane_index++){
4473         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4474             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4475                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4476
4477                 av_freep(&b->x_coeff);
4478             }
4479         }
4480     }
4481 }
4482
4483 static av_cold int encode_end(AVCodecContext *avctx)
4484 {
4485     SnowContext *s = avctx->priv_data;
4486
4487     common_end(s);
4488     av_free(avctx->stats_out);
4489
4490     return 0;
4491 }
4492
4493 static av_cold int decode_init(AVCodecContext *avctx)
4494 {
4495     avctx->pix_fmt= PIX_FMT_YUV420P;
4496
4497     common_init(avctx);
4498
4499     return 0;
4500 }
4501
4502 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt){
4503     const uint8_t *buf = avpkt->data;
4504     int buf_size = avpkt->size;
4505     SnowContext *s = avctx->priv_data;
4506     RangeCoder * const c= &s->c;
4507     int bytes_read;
4508     AVFrame *picture = data;
4509     int level, orientation, plane_index;
4510
4511     ff_init_range_decoder(c, buf, buf_size);
4512     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4513
4514     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4515     if(decode_header(s)<0)
4516         return -1;
4517     common_init_after_header(avctx);
4518
4519     // realloc slice buffer for the case that spatial_decomposition_count changed
4520     slice_buffer_destroy(&s->sb);
4521     slice_buffer_init(&s->sb, s->plane[0].height, (MB_SIZE >> s->block_max_depth) + s->spatial_decomposition_count * 8 + 1, s->plane[0].width, s->spatial_idwt_buffer);
4522
4523     for(plane_index=0; plane_index<3; plane_index++){
4524         Plane *p= &s->plane[plane_index];
4525         p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
4526                                               && p->hcoeff[1]==-10
4527                                               && p->hcoeff[2]==2;
4528     }
4529
4530     alloc_blocks(s);
4531
4532     if(frame_start(s) < 0)
4533         return -1;
4534     //keyframe flag duplication mess FIXME
4535     if(avctx->debug&FF_DEBUG_PICT_INFO)
4536         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4537
4538     decode_blocks(s);
4539
4540     for(plane_index=0; plane_index<3; plane_index++){
4541         Plane *p= &s->plane[plane_index];
4542         int w= p->width;
4543         int h= p->height;
4544         int x, y;
4545         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4546
4547         if(s->avctx->debug&2048){
4548             memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4549             predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4550
4551             for(y=0; y<h; y++){
4552                 for(x=0; x<w; x++){
4553                     int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4554                     s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4555                 }
4556             }
4557         }
4558
4559         {
4560         for(level=0; level<s->spatial_decomposition_count; level++){
4561             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4562                 SubBand *b= &p->band[level][orientation];
4563                 unpack_coeffs(s, b, b->parent, orientation);
4564             }
4565         }
4566         }
4567
4568         {
4569         const int mb_h= s->b_height << s->block_max_depth;
4570         const int block_size = MB_SIZE >> s->block_max_depth;
4571         const int block_w    = plane_index ? block_size/2 : block_size;
4572         int mb_y;
4573         DWTCompose cs[MAX_DECOMPOSITIONS];
4574         int yd=0, yq=0;
4575         int y;
4576         int end_y;
4577
4578         ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4579         for(mb_y=0; mb_y<=mb_h; mb_y++){
4580
4581             int slice_starty = block_w*mb_y;
4582             int slice_h = block_w*(mb_y+1);
4583             if (!(s->keyframe || s->avctx->debug&512)){
4584                 slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4585                 slice_h -= (block_w >> 1);
4586             }
4587
4588             for(level=0; level<s->spatial_decomposition_count; level++){
4589                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
4590                     SubBand *b= &p->band[level][orientation];
4591                     int start_y;
4592                     int end_y;
4593                     int our_mb_start = mb_y;
4594                     int our_mb_end = (mb_y + 1);
4595                     const int extra= 3;
4596                     start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4597                     end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4598                     if (!(s->keyframe || s->avctx->debug&512)){
4599                         start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4600                         end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4601                     }
4602                     start_y = FFMIN(b->height, start_y);
4603                     end_y = FFMIN(b->height, end_y);
4604
4605                     if (start_y != end_y){
4606                         if (orientation == 0){
4607                             SubBand * correlate_band = &p->band[0][0];
4608                             int correlate_end_y = FFMIN(b->height, end_y + 1);
4609                             int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4610                             decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4611                             correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4612                             dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
4613                         }
4614                         else
4615                             decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4616                     }
4617                 }
4618             }
4619
4620             for(; yd<slice_h; yd+=4){
4621                 ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4622             }
4623
4624             if(s->qlog == LOSSLESS_QLOG){
4625                 for(; yq<slice_h && yq<h; yq++){
4626                     IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4627                     for(x=0; x<w; x++){
4628                         line[x] <<= FRAC_BITS;
4629                     }
4630                 }
4631             }
4632
4633             predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
4634
4635             y = FFMIN(p->height, slice_starty);
4636             end_y = FFMIN(p->height, slice_h);
4637             while(y < end_y)
4638                 slice_buffer_release(&s->sb, y++);
4639         }
4640
4641         slice_buffer_flush(&s->sb);
4642         }
4643
4644     }
4645
4646     emms_c();
4647
4648     release_buffer(avctx);
4649
4650     if(!(s->avctx->debug&2048))
4651         *picture= s->current_picture;
4652     else
4653         *picture= s->mconly_picture;
4654
4655     *data_size = sizeof(AVFrame);
4656
4657     bytes_read= c->bytestream - c->bytestream_start;
4658     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4659
4660     return bytes_read;
4661 }
4662
4663 static av_cold int decode_end(AVCodecContext *avctx)
4664 {
4665     SnowContext *s = avctx->priv_data;
4666
4667     slice_buffer_destroy(&s->sb);
4668
4669     common_end(s);
4670
4671     return 0;
4672 }
4673
4674 AVCodec snow_decoder = {
4675     "snow",
4676     CODEC_TYPE_VIDEO,
4677     CODEC_ID_SNOW,
4678     sizeof(SnowContext),
4679     decode_init,
4680     NULL,
4681     decode_end,
4682     decode_frame,
4683     CODEC_CAP_DR1 /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4684     NULL,
4685     .long_name = NULL_IF_CONFIG_SMALL("Snow"),
4686 };
4687
4688 #if CONFIG_SNOW_ENCODER
4689 AVCodec snow_encoder = {
4690     "snow",
4691     CODEC_TYPE_VIDEO,
4692     CODEC_ID_SNOW,
4693     sizeof(SnowContext),
4694     encode_init,
4695     encode_frame,
4696     encode_end,
4697     .long_name = NULL_IF_CONFIG_SMALL("Snow"),
4698 };
4699 #endif
4700
4701
4702 #ifdef TEST
4703 #undef malloc
4704 #undef free
4705 #undef printf
4706
4707 #include "libavutil/lfg.h"
4708
4709 int main(void){
4710     int width=256;
4711     int height=256;
4712     int buffer[2][width*height];
4713     SnowContext s;
4714     int i;
4715     AVLFG prng;
4716     s.spatial_decomposition_count=6;
4717     s.spatial_decomposition_type=1;
4718
4719     av_lfg_init(&prng, 1);
4720
4721     printf("testing 5/3 DWT\n");
4722     for(i=0; i<width*height; i++)
4723         buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
4724
4725     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4726     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4727
4728     for(i=0; i<width*height; i++)
4729         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
4730
4731     printf("testing 9/7 DWT\n");
4732     s.spatial_decomposition_type=0;
4733     for(i=0; i<width*height; i++)
4734         buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
4735
4736     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4737     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4738
4739     for(i=0; i<width*height; i++)
4740         if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
4741
4742 #if 0
4743     printf("testing AC coder\n");
4744     memset(s.header_state, 0, sizeof(s.header_state));
4745     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4746     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4747
4748     for(i=-256; i<256; i++){
4749         put_symbol(&s.c, s.header_state, i*i*i/3*FFABS(i), 1);
4750     }
4751     ff_rac_terminate(&s.c);
4752
4753     memset(s.header_state, 0, sizeof(s.header_state));
4754     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4755     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4756
4757     for(i=-256; i<256; i++){
4758         int j;
4759         j= get_symbol(&s.c, s.header_state, 1);
4760         if(j!=i*i*i/3*FFABS(i)) printf("fsck: %d != %d\n", i, j);
4761     }
4762 #endif
4763     {
4764     int level, orientation, x, y;
4765     int64_t errors[8][4];
4766     int64_t g=0;
4767
4768         memset(errors, 0, sizeof(errors));
4769         s.spatial_decomposition_count=3;
4770         s.spatial_decomposition_type=0;
4771         for(level=0; level<s.spatial_decomposition_count; level++){
4772             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4773                 int w= width  >> (s.spatial_decomposition_count-level);
4774                 int h= height >> (s.spatial_decomposition_count-level);
4775                 int stride= width  << (s.spatial_decomposition_count-level);
4776                 DWTELEM *buf= buffer[0];
4777                 int64_t error=0;
4778
4779                 if(orientation&1) buf+=w;
4780                 if(orientation>1) buf+=stride>>1;
4781
4782                 memset(buffer[0], 0, sizeof(int)*width*height);
4783                 buf[w/2 + h/2*stride]= 256*256;
4784                 ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4785                 for(y=0; y<height; y++){
4786                     for(x=0; x<width; x++){
4787                         int64_t d= buffer[0][x + y*width];
4788                         error += d*d;
4789                         if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
4790                     }
4791                     if(FFABS(height/2-y)<9 && level==2) printf("\n");
4792                 }
4793                 error= (int)(sqrt(error)+0.5);
4794                 errors[level][orientation]= error;
4795                 if(g) g=av_gcd(g, error);
4796                 else g= error;
4797             }
4798         }
4799         printf("static int const visual_weight[][4]={\n");
4800         for(level=0; level<s.spatial_decomposition_count; level++){
4801             printf("  {");
4802             for(orientation=0; orientation<4; orientation++){
4803                 printf("%8"PRId64",", errors[level][orientation]/g);
4804             }
4805             printf("},\n");
4806         }
4807         printf("};\n");
4808         {
4809             int level=2;
4810             int w= width  >> (s.spatial_decomposition_count-level);
4811             //int h= height >> (s.spatial_decomposition_count-level);
4812             int stride= width  << (s.spatial_decomposition_count-level);
4813             DWTELEM *buf= buffer[0];
4814             int64_t error=0;
4815
4816             buf+=w;
4817             buf+=stride>>1;
4818
4819             memset(buffer[0], 0, sizeof(int)*width*height);
4820 #if 1
4821             for(y=0; y<height; y++){
4822                 for(x=0; x<width; x++){
4823                     int tab[4]={0,2,3,1};
4824                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4825                 }
4826             }
4827             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4828 #else
4829             for(y=0; y<h; y++){
4830                 for(x=0; x<w; x++){
4831                     buf[x + y*stride  ]=169;
4832                     buf[x + y*stride-w]=64;
4833                 }
4834             }
4835             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4836 #endif
4837             for(y=0; y<height; y++){
4838                 for(x=0; x<width; x++){
4839                     int64_t d= buffer[0][x + y*width];
4840                     error += d*d;
4841                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
4842                 }
4843                 if(FFABS(height/2-y)<9) printf("\n");
4844             }
4845         }
4846
4847     }
4848     return 0;
4849 }
4850 #endif /* TEST */