libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "snow.h"
  24
  25 #include "rangecoder.h"
  26
  27 #include "mpegvideo.h"
  28
  29 #undef NDEBUG
  30 #include <assert.h>
  31
  32 static const int8_t quant3[256]={
  33  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  48 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  49 };
  50 static const int8_t quant3b[256]={
  51  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  67 };
  68 static const int8_t quant3bA[256]={
  69  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  85 };
  86 static const int8_t quant5[256]={
  87  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 102 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 103 };
 104 static const int8_t quant7[256]={
 105  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 107  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 118 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 120 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 121 };
 122 static const int8_t quant9[256]={
 123  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 124  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 137 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 138 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 139 };
 140 static const int8_t quant11[256]={
 141  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 142  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 143  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 154 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 155 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 156 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 157 };
 158 static const int8_t quant13[256]={
 159  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 160  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 162  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 171 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 174 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 175 };
 176
 177 #if 0 //64*cubic
 178 static const uint8_t obmc32[1024]={
 179  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 180  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 181  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 182  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 183  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 184  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 185  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 186  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 187  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 188  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 189  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 190  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 191  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 192  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 193  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 194  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 195  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 196  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 197  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 198  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 199  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 200  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 201  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 202  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 203  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 204  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 205  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 206  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 207  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 208  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 209  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 210  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 211 //error:0.000022
 212 };
 213 static const uint8_t obmc16[256]={
 214  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 215  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 216  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 217  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 218  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 219  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 220  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 221  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 222  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 223  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 224  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 225  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 226  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 227  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 228  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 229  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 230 //error:0.000033
 231 };
 232 #elif 1 // 64*linear
 233 static const uint8_t obmc32[1024]={
 234   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 235   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 236   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 237   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 238   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 239   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 240   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 241   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 242   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 243   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 244   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 245   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 246   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 247   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 248   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 251   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 252   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 253   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 254   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 255   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 256   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 257   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 258   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 259   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 260   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 261   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 262   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 263   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 264   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 265   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 266  //error:0.000020
 267 };
 268 static const uint8_t obmc16[256]={
 269   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 270   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 271   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 272   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 273   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 274  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 275  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 278  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 279  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 280   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 281   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 282   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 283   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 284   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 285 //error:0.000015
 286 };
 287 #else //64*cos
 288 static const uint8_t obmc32[1024]={
 289  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 290  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 291  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 292  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 293  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 294  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 295  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 296  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 297  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 298  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 299  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 300  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 301  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 302  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 303  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 304  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 305  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 306  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 307  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 308  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 309  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 310  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 311  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 312  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 313  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 314  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 315  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 316  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 317  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 318  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 319  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 320  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 321 //error:0.000022
 322 };
 323 static const uint8_t obmc16[256]={
 324  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 325  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 326  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 327  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 328  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 329  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 330  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 331  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 332  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 333  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 334  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 335  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 336  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 337  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 338  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 339  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 340 //error:0.000022
 341 };
 342 #endif
 343
 344 //linear *64
 345 static const uint8_t obmc8[64]={
 346   4, 12, 20, 28, 28, 20, 12,  4,
 347  12, 36, 60, 84, 84, 60, 36, 12,
 348  20, 60,100,140,140,100, 60, 20,
 349  28, 84,140,196,196,140, 84, 28,
 350  28, 84,140,196,196,140, 84, 28,
 351  20, 60,100,140,140,100, 60, 20,
 352  12, 36, 60, 84, 84, 60, 36, 12,
 353   4, 12, 20, 28, 28, 20, 12,  4,
 354 //error:0.000000
 355 };
 356
 357 //linear *64
 358 static const uint8_t obmc4[16]={
 359  16, 48, 48, 16,
 360  48,144,144, 48,
 361  48,144,144, 48,
 362  16, 48, 48, 16,
 363 //error:0.000000
 364 };
 365
 366 static const uint8_t *obmc_tab[4]={
 367     obmc32, obmc16, obmc8, obmc4
 368 };
 369
 370 static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
 371
 372 typedef struct BlockNode{
 373     int16_t mx;
 374     int16_t my;
 375     uint8_t ref;
 376     uint8_t color[3];
 377     uint8_t type;
 378 //#define TYPE_SPLIT    1
 379 #define BLOCK_INTRA   1
 380 #define BLOCK_OPT     2
 381 //#define TYPE_NOCOLOR  4
 382     uint8_t level; //FIXME merge into type?
 383 }BlockNode;
 384
 385 static const BlockNode null_block= { //FIXME add border maybe
 386     .color= {128,128,128},
 387     .mx= 0,
 388     .my= 0,
 389     .ref= 0,
 390     .type= 0,
 391     .level= 0,
 392 };
 393
 394 #define LOG2_MB_SIZE 4
 395 #define MB_SIZE (1<<LOG2_MB_SIZE)
 396
 397 typedef struct x_and_coeff{
 398     int16_t x;
 399     uint16_t coeff;
 400 } x_and_coeff;
 401
 402 typedef struct SubBand{
 403     int level;
 404     int stride;
 405     int width;
 406     int height;
 407     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 408     DWTELEM *buf;
 409     int buf_x_offset;
 410     int buf_y_offset;
 411     int stride_line; ///< Stride measured in lines, not pixels.
 412     x_and_coeff * x_coeff;
 413     struct SubBand *parent;
 414     uint8_t state[/*7*2*/ 7 + 512][32];
 415 }SubBand;
 416
 417 typedef struct Plane{
 418     int width;
 419     int height;
 420     SubBand band[MAX_DECOMPOSITIONS][4];
 421 }Plane;
 422
 423 typedef struct SnowContext{
 424 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 425
 426     AVCodecContext *avctx;
 427     RangeCoder c;
 428     DSPContext dsp;
 429     AVFrame new_picture;
 430     AVFrame input_picture;              ///< new_picture with the internal linesizes
 431     AVFrame current_picture;
 432     AVFrame last_picture[MAX_REF_FRAMES];
 433     AVFrame mconly_picture;
 434 //     uint8_t q_context[16];
 435     uint8_t header_state[32];
 436     uint8_t block_state[128 + 32*128];
 437     int keyframe;
 438     int always_reset;
 439     int version;
 440     int spatial_decomposition_type;
 441     int last_spatial_decomposition_type;
 442     int temporal_decomposition_type;
 443     int spatial_decomposition_count;
 444     int temporal_decomposition_count;
 445     int max_ref_frames;
 446     int ref_frames;
 447     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 448     uint32_t *ref_scores[MAX_REF_FRAMES];
 449     DWTELEM *spatial_dwt_buffer;
 450     int colorspace_type;
 451     int chroma_h_shift;
 452     int chroma_v_shift;
 453     int spatial_scalability;
 454     int qlog;
 455     int last_qlog;
 456     int lambda;
 457     int lambda2;
 458     int pass1_rc;
 459     int mv_scale;
 460     int last_mv_scale;
 461     int qbias;
 462     int last_qbias;
 463 #define QBIAS_SHIFT 3
 464     int b_width;
 465     int b_height;
 466     int block_max_depth;
 467     int last_block_max_depth;
 468     Plane plane[MAX_PLANES];
 469     BlockNode *block;
 470 #define ME_CACHE_SIZE 1024
 471     int me_cache[ME_CACHE_SIZE];
 472     int me_cache_generation;
 473     slice_buffer sb;
 474
 475     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 476 }SnowContext;
 477
 478 typedef struct {
 479     DWTELEM *b0;
 480     DWTELEM *b1;
 481     DWTELEM *b2;
 482     DWTELEM *b3;
 483     int y;
 484 } dwt_compose_t;
 485
 486 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 487 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 488
 489 static void iterative_me(SnowContext *s);
 490
 491 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
 492 {
 493     int i;
 494
 495     buf->base_buffer = base_buffer;
 496     buf->line_count = line_count;
 497     buf->line_width = line_width;
 498     buf->data_count = max_allocated_lines;
 499     buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
 500     buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
 501
 502     for (i = 0; i < max_allocated_lines; i++)
 503     {
 504       buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
 505     }
 506
 507     buf->data_stack_top = max_allocated_lines - 1;
 508 }
 509
 510 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 511 {
 512     int offset;
 513     DWTELEM * buffer;
 514
 515 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 516
 517     assert(buf->data_stack_top >= 0);
 518 //  assert(!buf->line[line]);
 519     if (buf->line[line])
 520         return buf->line[line];
 521
 522     offset = buf->line_width * line;
 523     buffer = buf->data_stack[buf->data_stack_top];
 524     buf->data_stack_top--;
 525     buf->line[line] = buffer;
 526
 527 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 528
 529     return buffer;
 530 }
 531
 532 static void slice_buffer_release(slice_buffer * buf, int line)
 533 {
 534     int offset;
 535     DWTELEM * buffer;
 536
 537     assert(line >= 0 && line < buf->line_count);
 538     assert(buf->line[line]);
 539
 540     offset = buf->line_width * line;
 541     buffer = buf->line[line];
 542     buf->data_stack_top++;
 543     buf->data_stack[buf->data_stack_top] = buffer;
 544     buf->line[line] = NULL;
 545
 546 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 547 }
 548
 549 static void slice_buffer_flush(slice_buffer * buf)
 550 {
 551     int i;
 552     for (i = 0; i < buf->line_count; i++)
 553     {
 554         if (buf->line[i])
 555         {
 556 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 557             slice_buffer_release(buf, i);
 558         }
 559     }
 560 }
 561
 562 static void slice_buffer_destroy(slice_buffer * buf)
 563 {
 564     int i;
 565     slice_buffer_flush(buf);
 566
 567     for (i = buf->data_count - 1; i >= 0; i--)
 568     {
 569         assert(buf->data_stack[i]);
 570         av_freep(&buf->data_stack[i]);
 571     }
 572     assert(buf->data_stack);
 573     av_freep(&buf->data_stack);
 574     assert(buf->line);
 575     av_freep(&buf->line);
 576 }
 577
 578 #ifdef __sgi
 579 // Avoid a name clash on SGI IRIX
 580 #undef qexp
 581 #endif
 582 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 583 static uint8_t qexp[QROOT];
 584
 585 static inline int mirror(int v, int m){
 586     while((unsigned)v > (unsigned)m){
 587         v=-v;
 588         if(v<0) v+= 2*m;
 589     }
 590     return v;
 591 }
 592
 593 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 594     int i;
 595
 596     if(v){
 597         const int a= FFABS(v);
 598         const int e= av_log2(a);
 599 #if 1
 600         const int el= FFMIN(e, 10);
 601         put_rac(c, state+0, 0);
 602
 603         for(i=0; i<el; i++){
 604             put_rac(c, state+1+i, 1);  //1..10
 605         }
 606         for(; i<e; i++){
 607             put_rac(c, state+1+9, 1);  //1..10
 608         }
 609         put_rac(c, state+1+FFMIN(i,9), 0);
 610
 611         for(i=e-1; i>=el; i--){
 612             put_rac(c, state+22+9, (a>>i)&1); //22..31
 613         }
 614         for(; i>=0; i--){
 615             put_rac(c, state+22+i, (a>>i)&1); //22..31
 616         }
 617
 618         if(is_signed)
 619             put_rac(c, state+11 + el, v < 0); //11..21
 620 #else
 621
 622         put_rac(c, state+0, 0);
 623         if(e<=9){
 624             for(i=0; i<e; i++){
 625                 put_rac(c, state+1+i, 1);  //1..10
 626             }
 627             put_rac(c, state+1+i, 0);
 628
 629             for(i=e-1; i>=0; i--){
 630                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 631             }
 632
 633             if(is_signed)
 634                 put_rac(c, state+11 + e, v < 0); //11..21
 635         }else{
 636             for(i=0; i<e; i++){
 637                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 638             }
 639             put_rac(c, state+1+FFMIN(i,9), 0);
 640
 641             for(i=e-1; i>=0; i--){
 642                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 643             }
 644
 645             if(is_signed)
 646                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 647         }
 648 #endif
 649     }else{
 650         put_rac(c, state+0, 1);
 651     }
 652 }
 653
 654 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 655     if(get_rac(c, state+0))
 656         return 0;
 657     else{
 658         int i, e, a;
 659         e= 0;
 660         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 661             e++;
 662         }
 663
 664         a= 1;
 665         for(i=e-1; i>=0; i--){
 666             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 667         }
 668
 669         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 670             return -a;
 671         else
 672             return a;
 673     }
 674 }
 675
 676 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 677     int i;
 678     int r= log2>=0 ? 1<<log2 : 1;
 679
 680     assert(v>=0);
 681     assert(log2>=-4);
 682
 683     while(v >= r){
 684         put_rac(c, state+4+log2, 1);
 685         v -= r;
 686         log2++;
 687         if(log2>0) r+=r;
 688     }
 689     put_rac(c, state+4+log2, 0);
 690
 691     for(i=log2-1; i>=0; i--){
 692         put_rac(c, state+31-i, (v>>i)&1);
 693     }
 694 }
 695
 696 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 697     int i;
 698     int r= log2>=0 ? 1<<log2 : 1;
 699     int v=0;
 700
 701     assert(log2>=-4);
 702
 703     while(get_rac(c, state+4+log2)){
 704         v+= r;
 705         log2++;
 706         if(log2>0) r+=r;
 707     }
 708
 709     for(i=log2-1; i>=0; i--){
 710         v+= get_rac(c, state+31-i)<<i;
 711     }
 712
 713     return v;
 714 }
 715
 716 static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 717     const int mirror_left= !highpass;
 718     const int mirror_right= (width&1) ^ highpass;
 719     const int w= (width>>1) - 1 + (highpass & width);
 720     int i;
 721
 722 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 723     if(mirror_left){
 724         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 725         dst += dst_step;
 726         src += src_step;
 727     }
 728
 729     for(i=0; i<w; i++){
 730         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 731     }
 732
 733     if(mirror_right){
 734         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 735     }
 736 }
 737
 738 #ifndef lift5
 739 static av_always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 740     const int mirror_left= !highpass;
 741     const int mirror_right= (width&1) ^ highpass;
 742     const int w= (width>>1) - 1 + (highpass & width);
 743     int i;
 744
 745     if(mirror_left){
 746         int r= 3*2*ref[0];
 747         r += r>>4;
 748         r += r>>8;
 749         dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
 750         dst += dst_step;
 751         src += src_step;
 752     }
 753
 754     for(i=0; i<w; i++){
 755         int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
 756         r += r>>4;
 757         r += r>>8;
 758         dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
 759     }
 760
 761     if(mirror_right){
 762         int r= 3*2*ref[w*ref_step];
 763         r += r>>4;
 764         r += r>>8;
 765         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
 766     }
 767 }
 768 #endif
 769
 770 #ifndef liftS
 771 static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 772     const int mirror_left= !highpass;
 773     const int mirror_right= (width&1) ^ highpass;
 774     const int w= (width>>1) - 1 + (highpass & width);
 775     int i;
 776
 777     assert(shift == 4);
 778 #define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
 779     if(mirror_left){
 780         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 781         dst += dst_step;
 782         src += src_step;
 783     }
 784
 785     for(i=0; i<w; i++){
 786         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 787     }
 788
 789     if(mirror_right){
 790         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 791     }
 792 }
 793 #endif
 794
 795
 796 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
 797     int x, i;
 798
 799     for(x=start; x<width; x+=2){
 800         int64_t sum=0;
 801
 802         for(i=0; i<n; i++){
 803             int x2= x + 2*i - n + 1;
 804             if     (x2<     0) x2= -x2;
 805             else if(x2>=width) x2= 2*width-x2-2;
 806             sum += coeffs[i]*(int64_t)dst[x2];
 807         }
 808         if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
 809         else        dst[x] += (sum + (1<<shift)/2)>>shift;
 810     }
 811 }
 812
 813 static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
 814     int x, y, i;
 815     for(y=start; y<height; y+=2){
 816         for(x=0; x<width; x++){
 817             int64_t sum=0;
 818
 819             for(i=0; i<n; i++){
 820                 int y2= y + 2*i - n + 1;
 821                 if     (y2<      0) y2= -y2;
 822                 else if(y2>=height) y2= 2*height-y2-2;
 823                 sum += coeffs[i]*(int64_t)dst[x + y2*stride];
 824             }
 825             if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
 826             else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
 827         }
 828     }
 829 }
 830
 831 #define SCALEX 1
 832 #define LX0 0
 833 #define LX1 1
 834
 835 #if 0 // more accurate 9/7
 836 #define N1 2
 837 #define SHIFT1 14
 838 #define COEFFS1 (int[]){-25987,-25987}
 839 #define N2 2
 840 #define SHIFT2 19
 841 #define COEFFS2 (int[]){-27777,-27777}
 842 #define N3 2
 843 #define SHIFT3 15
 844 #define COEFFS3 (int[]){28931,28931}
 845 #define N4 2
 846 #define SHIFT4 15
 847 #define COEFFS4 (int[]){14533,14533}
 848 #elif 1 // 13/7 CRF
 849 #define N1 4
 850 #define SHIFT1 4
 851 #define COEFFS1 (int[]){1,-9,-9,1}
 852 #define N2 4
 853 #define SHIFT2 4
 854 #define COEFFS2 (int[]){-1,5,5,-1}
 855 #define N3 0
 856 #define SHIFT3 1
 857 #define COEFFS3 NULL
 858 #define N4 0
 859 #define SHIFT4 1
 860 #define COEFFS4 NULL
 861 #elif 1 // 3/5
 862 #define LX0 1
 863 #define LX1 0
 864 #define SCALEX 0.5
 865 #define N1 2
 866 #define SHIFT1 1
 867 #define COEFFS1 (int[]){1,1}
 868 #define N2 2
 869 #define SHIFT2 2
 870 #define COEFFS2 (int[]){-1,-1}
 871 #define N3 0
 872 #define SHIFT3 0
 873 #define COEFFS3 NULL
 874 #define N4 0
 875 #define SHIFT4 0
 876 #define COEFFS4 NULL
 877 #elif 1 // 11/5
 878 #define N1 0
 879 #define SHIFT1 1
 880 #define COEFFS1 NULL
 881 #define N2 2
 882 #define SHIFT2 2
 883 #define COEFFS2 (int[]){-1,-1}
 884 #define N3 2
 885 #define SHIFT3 0
 886 #define COEFFS3 (int[]){-1,-1}
 887 #define N4 4
 888 #define SHIFT4 7
 889 #define COEFFS4 (int[]){-5,29,29,-5}
 890 #define SCALEX 4
 891 #elif 1 // 9/7 CDF
 892 #define N1 2
 893 #define SHIFT1 7
 894 #define COEFFS1 (int[]){-203,-203}
 895 #define N2 2
 896 #define SHIFT2 12
 897 #define COEFFS2 (int[]){-217,-217}
 898 #define N3 2
 899 #define SHIFT3 7
 900 #define COEFFS3 (int[]){113,113}
 901 #define N4 2
 902 #define SHIFT4 9
 903 #define COEFFS4 (int[]){227,227}
 904 #define SCALEX 1
 905 #elif 1 // 7/5 CDF
 906 #define N1 0
 907 #define SHIFT1 1
 908 #define COEFFS1 NULL
 909 #define N2 2
 910 #define SHIFT2 2
 911 #define COEFFS2 (int[]){-1,-1}
 912 #define N3 2
 913 #define SHIFT3 0
 914 #define COEFFS3 (int[]){-1,-1}
 915 #define N4 2
 916 #define SHIFT4 4
 917 #define COEFFS4 (int[]){3,3}
 918 #elif 1 // 9/7 MN
 919 #define N1 4
 920 #define SHIFT1 4
 921 #define COEFFS1 (int[]){1,-9,-9,1}
 922 #define N2 2
 923 #define SHIFT2 2
 924 #define COEFFS2 (int[]){1,1}
 925 #define N3 0
 926 #define SHIFT3 1
 927 #define COEFFS3 NULL
 928 #define N4 0
 929 #define SHIFT4 1
 930 #define COEFFS4 NULL
 931 #else // 13/7 CRF
 932 #define N1 4
 933 #define SHIFT1 4
 934 #define COEFFS1 (int[]){1,-9,-9,1}
 935 #define N2 4
 936 #define SHIFT2 4
 937 #define COEFFS2 (int[]){-1,5,5,-1}
 938 #define N3 0
 939 #define SHIFT3 1
 940 #define COEFFS3 NULL
 941 #define N4 0
 942 #define SHIFT4 1
 943 #define COEFFS4 NULL
 944 #endif
 945 static void horizontal_decomposeX(DWTELEM *b, int width){
 946     DWTELEM temp[width];
 947     const int width2= width>>1;
 948     const int w2= (width+1)>>1;
 949     int x;
 950
 951     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
 952     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
 953     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
 954     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
 955
 956     for(x=0; x<width2; x++){
 957         temp[x   ]= b[2*x    ];
 958         temp[x+w2]= b[2*x + 1];
 959     }
 960     if(width&1)
 961         temp[x   ]= b[2*x    ];
 962     memcpy(b, temp, width*sizeof(int));
 963 }
 964
 965 static void horizontal_composeX(DWTELEM *b, int width){
 966     DWTELEM temp[width];
 967     const int width2= width>>1;
 968     int x;
 969     const int w2= (width+1)>>1;
 970
 971     memcpy(temp, b, width*sizeof(int));
 972     for(x=0; x<width2; x++){
 973         b[2*x    ]= temp[x   ];
 974         b[2*x + 1]= temp[x+w2];
 975     }
 976     if(width&1)
 977         b[2*x    ]= temp[x   ];
 978
 979     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
 980     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
 981     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
 982     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
 983 }
 984
 985 static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
 986     int x, y;
 987
 988     for(y=0; y<height; y++){
 989         for(x=0; x<width; x++){
 990             buffer[y*stride + x] *= SCALEX;
 991         }
 992     }
 993
 994     for(y=0; y<height; y++){
 995         horizontal_decomposeX(buffer + y*stride, width);
 996     }
 997
 998     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
 999     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
1000     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
1001     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
1002 }
1003
1004 static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
1005     int x, y;
1006
1007     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1008     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1009     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1010     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1011
1012     for(y=0; y<height; y++){
1013         horizontal_composeX(buffer + y*stride, width);
1014     }
1015
1016     for(y=0; y<height; y++){
1017         for(x=0; x<width; x++){
1018             buffer[y*stride + x] /= SCALEX;
1019         }
1020     }
1021 }
1022
1023 static void horizontal_decompose53i(DWTELEM *b, int width){
1024     DWTELEM temp[width];
1025     const int width2= width>>1;
1026     int x;
1027     const int w2= (width+1)>>1;
1028
1029     for(x=0; x<width2; x++){
1030         temp[x   ]= b[2*x    ];
1031         temp[x+w2]= b[2*x + 1];
1032     }
1033     if(width&1)
1034         temp[x   ]= b[2*x    ];
1035 #if 0
1036     {
1037     int A1,A2,A3,A4;
1038     A2= temp[1       ];
1039     A4= temp[0       ];
1040     A1= temp[0+width2];
1041     A1 -= (A2 + A4)>>1;
1042     A4 += (A1 + 1)>>1;
1043     b[0+width2] = A1;
1044     b[0       ] = A4;
1045     for(x=1; x+1<width2; x+=2){
1046         A3= temp[x+width2];
1047         A4= temp[x+1     ];
1048         A3 -= (A2 + A4)>>1;
1049         A2 += (A1 + A3 + 2)>>2;
1050         b[x+width2] = A3;
1051         b[x       ] = A2;
1052
1053         A1= temp[x+1+width2];
1054         A2= temp[x+2       ];
1055         A1 -= (A2 + A4)>>1;
1056         A4 += (A1 + A3 + 2)>>2;
1057         b[x+1+width2] = A1;
1058         b[x+1       ] = A4;
1059     }
1060     A3= temp[width-1];
1061     A3 -= A2;
1062     A2 += (A1 + A3 + 2)>>2;
1063     b[width -1] = A3;
1064     b[width2-1] = A2;
1065     }
1066 #else
1067     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1068     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1069 #endif
1070 }
1071
1072 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1073     int i;
1074
1075     for(i=0; i<width; i++){
1076         b1[i] -= (b0[i] + b2[i])>>1;
1077     }
1078 }
1079
1080 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1081     int i;
1082
1083     for(i=0; i<width; i++){
1084         b1[i] += (b0[i] + b2[i] + 2)>>2;
1085     }
1086 }
1087
1088 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1089     int y;
1090     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1091     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1092
1093     for(y=-2; y<height; y+=2){
1094         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1095         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1096
1097 {START_TIMER
1098         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1099         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1100 STOP_TIMER("horizontal_decompose53i")}
1101
1102 {START_TIMER
1103         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1104         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1105 STOP_TIMER("vertical_decompose53i*")}
1106
1107         b0=b2;
1108         b1=b3;
1109     }
1110 }
1111
1112 static void horizontal_decompose97i(DWTELEM *b, int width){
1113     DWTELEM temp[width];
1114     const int w2= (width+1)>>1;
1115
1116     lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1117     liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1118     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1119     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1120 }
1121
1122
1123 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1124     int i;
1125
1126     for(i=0; i<width; i++){
1127         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1128     }
1129 }
1130
1131 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1132     int i;
1133
1134     for(i=0; i<width; i++){
1135 #ifdef lift5
1136         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1137 #else
1138         int r= 3*(b0[i] + b2[i]);
1139         r+= r>>4;
1140         r+= r>>8;
1141         b1[i] += (r+W_CO)>>W_CS;
1142 #endif
1143     }
1144 }
1145
1146 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1147     int i;
1148
1149     for(i=0; i<width; i++){
1150 #ifdef liftS
1151         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1152 #else
1153         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1154 #endif
1155     }
1156 }
1157
1158 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1159     int i;
1160
1161     for(i=0; i<width; i++){
1162         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1163     }
1164 }
1165
1166 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1167     int y;
1168     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1169     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1170     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1171     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1172
1173     for(y=-4; y<height; y+=2){
1174         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1175         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1176
1177 {START_TIMER
1178         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1179         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1180 if(width>400){
1181 STOP_TIMER("horizontal_decompose97i")
1182 }}
1183
1184 {START_TIMER
1185         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1186         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1187         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1188         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1189
1190 if(width>400){
1191 STOP_TIMER("vertical_decompose97i")
1192 }}
1193
1194         b0=b2;
1195         b1=b3;
1196         b2=b4;
1197         b3=b5;
1198     }
1199 }
1200
1201 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1202     int level;
1203
1204     for(level=0; level<decomposition_count; level++){
1205         switch(type){
1206         case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1207         case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1208         case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1209         }
1210     }
1211 }
1212
1213 static void horizontal_compose53i(DWTELEM *b, int width){
1214     DWTELEM temp[width];
1215     const int width2= width>>1;
1216     const int w2= (width+1)>>1;
1217     int x;
1218
1219 #if 0
1220     int A1,A2,A3,A4;
1221     A2= temp[1       ];
1222     A4= temp[0       ];
1223     A1= temp[0+width2];
1224     A1 -= (A2 + A4)>>1;
1225     A4 += (A1 + 1)>>1;
1226     b[0+width2] = A1;
1227     b[0       ] = A4;
1228     for(x=1; x+1<width2; x+=2){
1229         A3= temp[x+width2];
1230         A4= temp[x+1     ];
1231         A3 -= (A2 + A4)>>1;
1232         A2 += (A1 + A3 + 2)>>2;
1233         b[x+width2] = A3;
1234         b[x       ] = A2;
1235
1236         A1= temp[x+1+width2];
1237         A2= temp[x+2       ];
1238         A1 -= (A2 + A4)>>1;
1239         A4 += (A1 + A3 + 2)>>2;
1240         b[x+1+width2] = A1;
1241         b[x+1       ] = A4;
1242     }
1243     A3= temp[width-1];
1244     A3 -= A2;
1245     A2 += (A1 + A3 + 2)>>2;
1246     b[width -1] = A3;
1247     b[width2-1] = A2;
1248 #else
1249     lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1250     lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1251 #endif
1252     for(x=0; x<width2; x++){
1253         b[2*x    ]= temp[x   ];
1254         b[2*x + 1]= temp[x+w2];
1255     }
1256     if(width&1)
1257         b[2*x    ]= temp[x   ];
1258 }
1259
1260 static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1261     int i;
1262
1263     for(i=0; i<width; i++){
1264         b1[i] += (b0[i] + b2[i])>>1;
1265     }
1266 }
1267
1268 static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1269     int i;
1270
1271     for(i=0; i<width; i++){
1272         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1273     }
1274 }
1275
1276 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1277     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1278     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1279     cs->y = -1;
1280 }
1281
1282 static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1283     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1284     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1285     cs->y = -1;
1286 }
1287
1288 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1289     int y= cs->y;
1290
1291     DWTELEM *b0= cs->b0;
1292     DWTELEM *b1= cs->b1;
1293     DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1294     DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1295
1296 {START_TIMER
1297         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1298         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1299 STOP_TIMER("vertical_compose53i*")}
1300
1301 {START_TIMER
1302         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1303         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1304 STOP_TIMER("horizontal_compose53i")}
1305
1306     cs->b0 = b2;
1307     cs->b1 = b3;
1308     cs->y += 2;
1309 }
1310
1311 static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1312     int y= cs->y;
1313     DWTELEM *b0= cs->b0;
1314     DWTELEM *b1= cs->b1;
1315     DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1316     DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1317
1318 {START_TIMER
1319         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1320         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1321 STOP_TIMER("vertical_compose53i*")}
1322
1323 {START_TIMER
1324         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1325         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1326 STOP_TIMER("horizontal_compose53i")}
1327
1328     cs->b0 = b2;
1329     cs->b1 = b3;
1330     cs->y += 2;
1331 }
1332
1333 static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1334     dwt_compose_t cs;
1335     spatial_compose53i_init(&cs, buffer, height, stride);
1336     while(cs.y <= height)
1337         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1338 }
1339
1340
1341 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1342     DWTELEM temp[width];
1343     const int w2= (width+1)>>1;
1344
1345     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1346     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1347     liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1348     lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1349 }
1350
1351 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1352     int i;
1353
1354     for(i=0; i<width; i++){
1355         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1356     }
1357 }
1358
1359 static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1360     int i;
1361
1362     for(i=0; i<width; i++){
1363 #ifdef lift5
1364         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1365 #else
1366         int r= 3*(b0[i] + b2[i]);
1367         r+= r>>4;
1368         r+= r>>8;
1369         b1[i] -= (r+W_CO)>>W_CS;
1370 #endif
1371     }
1372 }
1373
1374 static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1375     int i;
1376
1377     for(i=0; i<width; i++){
1378 #ifdef liftS
1379         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1380 #else
1381         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1382 #endif
1383     }
1384 }
1385
1386 static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1387     int i;
1388
1389     for(i=0; i<width; i++){
1390         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1391     }
1392 }
1393
1394 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1395     int i;
1396
1397     for(i=0; i<width; i++){
1398 #ifndef lift5
1399         int r;
1400 #endif
1401         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1402 #ifdef lift5
1403         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1404 #else
1405         r= 3*(b2[i] + b4[i]);
1406         r+= r>>4;
1407         r+= r>>8;
1408         b3[i] -= (r+W_CO)>>W_CS;
1409 #endif
1410 #ifdef liftS
1411         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1412 #else
1413         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1414 #endif
1415         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1416     }
1417 }
1418
1419 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1420     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1421     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1422     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1423     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1424     cs->y = -3;
1425 }
1426
1427 static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1428     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1429     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1430     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1431     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1432     cs->y = -3;
1433 }
1434
1435 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1436     int y = cs->y;
1437
1438     DWTELEM *b0= cs->b0;
1439     DWTELEM *b1= cs->b1;
1440     DWTELEM *b2= cs->b2;
1441     DWTELEM *b3= cs->b3;
1442     DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1443     DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1444
1445 {START_TIMER
1446     if(y>0 && y+4<height){
1447         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1448     }else{
1449         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1450         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1451         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1452         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1453     }
1454 if(width>400){
1455 STOP_TIMER("vertical_compose97i")}}
1456
1457 {START_TIMER
1458         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1459         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1460 if(width>400 && y+0<(unsigned)height){
1461 STOP_TIMER("horizontal_compose97i")}}
1462
1463     cs->b0=b2;
1464     cs->b1=b3;
1465     cs->b2=b4;
1466     cs->b3=b5;
1467     cs->y += 2;
1468 }
1469
1470 static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1471     int y = cs->y;
1472     DWTELEM *b0= cs->b0;
1473     DWTELEM *b1= cs->b1;
1474     DWTELEM *b2= cs->b2;
1475     DWTELEM *b3= cs->b3;
1476     DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1477     DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1478
1479 {START_TIMER
1480         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1481         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1482         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1483         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1484 if(width>400){
1485 STOP_TIMER("vertical_compose97i")}}
1486
1487 {START_TIMER
1488         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1489         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1490 if(width>400 && b0 <= b2){
1491 STOP_TIMER("horizontal_compose97i")}}
1492
1493     cs->b0=b2;
1494     cs->b1=b3;
1495     cs->b2=b4;
1496     cs->b3=b5;
1497     cs->y += 2;
1498 }
1499
1500 static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1501     dwt_compose_t cs;
1502     spatial_compose97i_init(&cs, buffer, height, stride);
1503     while(cs.y <= height)
1504         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1505 }
1506
1507 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1508     int level;
1509     for(level=decomposition_count-1; level>=0; level--){
1510         switch(type){
1511         case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1512         case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1513         /* not slicified yet */
1514         case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1515           av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1516         }
1517     }
1518 }
1519
1520 static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1521     int level;
1522     for(level=decomposition_count-1; level>=0; level--){
1523         switch(type){
1524         case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1525         case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1526         /* not slicified yet */
1527         case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1528         }
1529     }
1530 }
1531
1532 static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1533     const int support = type==1 ? 3 : 5;
1534     int level;
1535     if(type==2) return;
1536
1537     for(level=decomposition_count-1; level>=0; level--){
1538         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1539             switch(type){
1540             case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1541                     break;
1542             case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1543                     break;
1544             case DWT_X: break;
1545             }
1546         }
1547     }
1548 }
1549
1550 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1551     const int support = type==1 ? 3 : 5;
1552     int level;
1553     if(type==2) return;
1554
1555     for(level=decomposition_count-1; level>=0; level--){
1556         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1557             switch(type){
1558             case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1559                     break;
1560             case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1561                     break;
1562             case DWT_X: break;
1563             }
1564         }
1565     }
1566 }
1567
1568 static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1569     if(type==2){
1570         int level;
1571         for(level=decomposition_count-1; level>=0; level--)
1572             spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1573     }else{
1574         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1575         int y;
1576         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1577         for(y=0; y<height; y+=4)
1578             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1579     }
1580 }
1581
1582 static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1583     const int w= b->width;
1584     const int h= b->height;
1585     int x, y;
1586
1587     if(1){
1588         int run=0;
1589         int runs[w*h];
1590         int run_index=0;
1591         int max_index;
1592
1593         for(y=0; y<h; y++){
1594             for(x=0; x<w; x++){
1595                 int v, p=0;
1596                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1597                 v= src[x + y*stride];
1598
1599                 if(y){
1600                     t= src[x + (y-1)*stride];
1601                     if(x){
1602                         lt= src[x - 1 + (y-1)*stride];
1603                     }
1604                     if(x + 1 < w){
1605                         rt= src[x + 1 + (y-1)*stride];
1606                     }
1607                 }
1608                 if(x){
1609                     l= src[x - 1 + y*stride];
1610                     /*if(x > 1){
1611                         if(orientation==1) ll= src[y + (x-2)*stride];
1612                         else               ll= src[x - 2 + y*stride];
1613                     }*/
1614                 }
1615                 if(parent){
1616                     int px= x>>1;
1617                     int py= y>>1;
1618                     if(px<b->parent->width && py<b->parent->height)
1619                         p= parent[px + py*2*stride];
1620                 }
1621                 if(!(/*ll|*/l|lt|t|rt|p)){
1622                     if(v){
1623                         runs[run_index++]= run;
1624                         run=0;
1625                     }else{
1626                         run++;
1627                     }
1628                 }
1629             }
1630         }
1631         max_index= run_index;
1632         runs[run_index++]= run;
1633         run_index=0;
1634         run= runs[run_index++];
1635
1636         put_symbol2(&s->c, b->state[30], max_index, 0);
1637         if(run_index <= max_index)
1638             put_symbol2(&s->c, b->state[1], run, 3);
1639
1640         for(y=0; y<h; y++){
1641             if(s->c.bytestream_end - s->c.bytestream < w*40){
1642                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1643                 return -1;
1644             }
1645             for(x=0; x<w; x++){
1646                 int v, p=0;
1647                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1648                 v= src[x + y*stride];
1649
1650                 if(y){
1651                     t= src[x + (y-1)*stride];
1652                     if(x){
1653                         lt= src[x - 1 + (y-1)*stride];
1654                     }
1655                     if(x + 1 < w){
1656                         rt= src[x + 1 + (y-1)*stride];
1657                     }
1658                 }
1659                 if(x){
1660                     l= src[x - 1 + y*stride];
1661                     /*if(x > 1){
1662                         if(orientation==1) ll= src[y + (x-2)*stride];
1663                         else               ll= src[x - 2 + y*stride];
1664                     }*/
1665                 }
1666                 if(parent){
1667                     int px= x>>1;
1668                     int py= y>>1;
1669                     if(px<b->parent->width && py<b->parent->height)
1670                         p= parent[px + py*2*stride];
1671                 }
1672                 if(/*ll|*/l|lt|t|rt|p){
1673                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1674
1675                     put_rac(&s->c, &b->state[0][context], !!v);
1676                 }else{
1677                     if(!run){
1678                         run= runs[run_index++];
1679
1680                         if(run_index <= max_index)
1681                             put_symbol2(&s->c, b->state[1], run, 3);
1682                         assert(v);
1683                     }else{
1684                         run--;
1685                         assert(!v);
1686                     }
1687                 }
1688                 if(v){
1689                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1690                     int l2= 2*FFABS(l) + (l<0);
1691                     int t2= 2*FFABS(t) + (t<0);
1692
1693                     put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1694                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1695                 }
1696             }
1697         }
1698     }
1699     return 0;
1700 }
1701
1702 static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1703 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1704 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1705     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1706 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1707 }
1708
1709 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1710     const int w= b->width;
1711     const int h= b->height;
1712     int x,y;
1713
1714     if(1){
1715         int run, runs;
1716         x_and_coeff *xc= b->x_coeff;
1717         x_and_coeff *prev_xc= NULL;
1718         x_and_coeff *prev2_xc= xc;
1719         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1720         x_and_coeff *prev_parent_xc= parent_xc;
1721
1722         runs= get_symbol2(&s->c, b->state[30], 0);
1723         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1724         else           run= INT_MAX;
1725
1726         for(y=0; y<h; y++){
1727             int v=0;
1728             int lt=0, t=0, rt=0;
1729
1730             if(y && prev_xc->x == 0){
1731                 rt= prev_xc->coeff;
1732             }
1733             for(x=0; x<w; x++){
1734                 int p=0;
1735                 const int l= v;
1736
1737                 lt= t; t= rt;
1738
1739                 if(y){
1740                     if(prev_xc->x <= x)
1741                         prev_xc++;
1742                     if(prev_xc->x == x + 1)
1743                         rt= prev_xc->coeff;
1744                     else
1745                         rt=0;
1746                 }
1747                 if(parent_xc){
1748                     if(x>>1 > parent_xc->x){
1749                         parent_xc++;
1750                     }
1751                     if(x>>1 == parent_xc->x){
1752                         p= parent_xc->coeff;
1753                     }
1754                 }
1755                 if(/*ll|*/l|lt|t|rt|p){
1756                     int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1757
1758                     v=get_rac(&s->c, &b->state[0][context]);
1759                     if(v){
1760                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1761                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1762
1763                         xc->x=x;
1764                         (xc++)->coeff= v;
1765                     }
1766                 }else{
1767                     if(!run){
1768                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1769                         else           run= INT_MAX;
1770                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1771                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1772
1773                         xc->x=x;
1774                         (xc++)->coeff= v;
1775                     }else{
1776                         int max_run;
1777                         run--;
1778                         v=0;
1779
1780                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1781                         else  max_run= FFMIN(run, w-x-1);
1782                         if(parent_xc)
1783                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1784                         x+= max_run;
1785                         run-= max_run;
1786                     }
1787                 }
1788             }
1789             (xc++)->x= w+1; //end marker
1790             prev_xc= prev2_xc;
1791             prev2_xc= xc;
1792
1793             if(parent_xc){
1794                 if(y&1){
1795                     while(parent_xc->x != parent->width+1)
1796                         parent_xc++;
1797                     parent_xc++;
1798                     prev_parent_xc= parent_xc;
1799                 }else{
1800                     parent_xc= prev_parent_xc;
1801                 }
1802             }
1803         }
1804
1805         (xc++)->x= w+1; //end marker
1806     }
1807 }
1808
1809 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1810     const int w= b->width;
1811     int y;
1812     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1813     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1814     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1815     int new_index = 0;
1816
1817     START_TIMER
1818
1819     if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1820         qadd= 0;
1821         qmul= 1<<QEXPSHIFT;
1822     }
1823
1824     /* If we are on the second or later slice, restore our index. */
1825     if (start_y != 0)
1826         new_index = save_state[0];
1827
1828
1829     for(y=start_y; y<h; y++){
1830         int x = 0;
1831         int v;
1832         DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1833         memset(line, 0, b->width*sizeof(DWTELEM));
1834         v = b->x_coeff[new_index].coeff;
1835         x = b->x_coeff[new_index++].x;
1836         while(x < w)
1837         {
1838             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1839             register int u= -(v&1);
1840             line[x] = (t^u) - u;
1841
1842             v = b->x_coeff[new_index].coeff;
1843             x = b->x_coeff[new_index++].x;
1844         }
1845     }
1846     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1847         STOP_TIMER("decode_subband")
1848     }
1849
1850     /* Save our variables for the next slice. */
1851     save_state[0] = new_index;
1852
1853     return;
1854 }
1855
1856 static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1857     int plane_index, level, orientation;
1858
1859     for(plane_index=0; plane_index<3; plane_index++){
1860         for(level=0; level<s->spatial_decomposition_count; level++){
1861             for(orientation=level ? 1:0; orientation<4; orientation++){
1862                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1863             }
1864         }
1865     }
1866     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1867     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1868 }
1869
1870 static int alloc_blocks(SnowContext *s){
1871     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1872     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1873
1874     s->b_width = w;
1875     s->b_height= h;
1876
1877     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1878     return 0;
1879 }
1880
1881 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1882     uint8_t *bytestream= d->bytestream;
1883     uint8_t *bytestream_start= d->bytestream_start;
1884     *d= *s;
1885     d->bytestream= bytestream;
1886     d->bytestream_start= bytestream_start;
1887 }
1888
1889 //near copy & paste from dsputil, FIXME
1890 static int pix_sum(uint8_t * pix, int line_size, int w)
1891 {
1892     int s, i, j;
1893
1894     s = 0;
1895     for (i = 0; i < w; i++) {
1896         for (j = 0; j < w; j++) {
1897             s += pix[0];
1898             pix ++;
1899         }
1900         pix += line_size - w;
1901     }
1902     return s;
1903 }
1904
1905 //near copy & paste from dsputil, FIXME
1906 static int pix_norm1(uint8_t * pix, int line_size, int w)
1907 {
1908     int s, i, j;
1909     uint32_t *sq = ff_squareTbl + 256;
1910
1911     s = 0;
1912     for (i = 0; i < w; i++) {
1913         for (j = 0; j < w; j ++) {
1914             s += sq[pix[0]];
1915             pix ++;
1916         }
1917         pix += line_size - w;
1918     }
1919     return s;
1920 }
1921
1922 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1923     const int w= s->b_width << s->block_max_depth;
1924     const int rem_depth= s->block_max_depth - level;
1925     const int index= (x + y*w) << rem_depth;
1926     const int block_w= 1<<rem_depth;
1927     BlockNode block;
1928     int i,j;
1929
1930     block.color[0]= l;
1931     block.color[1]= cb;
1932     block.color[2]= cr;
1933     block.mx= mx;
1934     block.my= my;
1935     block.ref= ref;
1936     block.type= type;
1937     block.level= level;
1938
1939     for(j=0; j<block_w; j++){
1940         for(i=0; i<block_w; i++){
1941             s->block[index + i + j*w]= block;
1942         }
1943     }
1944 }
1945
1946 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1947     const int offset[3]= {
1948           y*c->  stride + x,
1949         ((y*c->uvstride + x)>>1),
1950         ((y*c->uvstride + x)>>1),
1951     };
1952     int i;
1953     for(i=0; i<3; i++){
1954         c->src[0][i]= src [i];
1955         c->ref[0][i]= ref [i] + offset[i];
1956     }
1957     assert(!ref_index);
1958 }
1959
1960 static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1961                            const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1962     if(s->ref_frames == 1){
1963         *mx = mid_pred(left->mx, top->mx, tr->mx);
1964         *my = mid_pred(left->my, top->my, tr->my);
1965     }else{
1966         const int *scale = scale_mv_ref[ref];
1967         *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1968                        (top ->mx * scale[top ->ref] + 128) >>8,
1969                        (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1970         *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1971                        (top ->my * scale[top ->ref] + 128) >>8,
1972                        (tr  ->my * scale[tr  ->ref] + 128) >>8);
1973     }
1974 }
1975
1976 //FIXME copy&paste
1977 #define P_LEFT P[1]
1978 #define P_TOP P[2]
1979 #define P_TOPRIGHT P[3]
1980 #define P_MEDIAN P[4]
1981 #define P_MV1 P[9]
1982 #define FLAG_QPEL   1 //must be 1
1983
1984 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1985     uint8_t p_buffer[1024];
1986     uint8_t i_buffer[1024];
1987     uint8_t p_state[sizeof(s->block_state)];
1988     uint8_t i_state[sizeof(s->block_state)];
1989     RangeCoder pc, ic;
1990     uint8_t *pbbak= s->c.bytestream;
1991     uint8_t *pbbak_start= s->c.bytestream_start;
1992     int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1993     const int w= s->b_width  << s->block_max_depth;
1994     const int h= s->b_height << s->block_max_depth;
1995     const int rem_depth= s->block_max_depth - level;
1996     const int index= (x + y*w) << rem_depth;
1997     const int block_w= 1<<(LOG2_MB_SIZE - level);
1998     int trx= (x+1)<<rem_depth;
1999     int try= (y+1)<<rem_depth;
2000     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2001     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2002     const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2003     const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2004     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2005     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2006     int pl = left->color[0];
2007     int pcb= left->color[1];
2008     int pcr= left->color[2];
2009     int pmx, pmy;
2010     int mx=0, my=0;
2011     int l,cr,cb;
2012     const int stride= s->current_picture.linesize[0];
2013     const int uvstride= s->current_picture.linesize[1];
2014     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2015                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2016                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2017     int P[10][2];
2018     int16_t last_mv[3][2];
2019     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2020     const int shift= 1+qpel;
2021     MotionEstContext *c= &s->m.me;
2022     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2023     int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2024     int my_context= av_log2(2*FFABS(left->my - top->my));
2025     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2026     int ref, best_ref, ref_score, ref_mx, ref_my;
2027
2028     assert(sizeof(s->block_state) >= 256);
2029     if(s->keyframe){
2030         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2031         return 0;
2032     }
2033
2034 //    clip predictors / edge ?
2035
2036     P_LEFT[0]= left->mx;
2037     P_LEFT[1]= left->my;
2038     P_TOP [0]= top->mx;
2039     P_TOP [1]= top->my;
2040     P_TOPRIGHT[0]= tr->mx;
2041     P_TOPRIGHT[1]= tr->my;
2042
2043     last_mv[0][0]= s->block[index].mx;
2044     last_mv[0][1]= s->block[index].my;
2045     last_mv[1][0]= right->mx;
2046     last_mv[1][1]= right->my;
2047     last_mv[2][0]= bottom->mx;
2048     last_mv[2][1]= bottom->my;
2049
2050     s->m.mb_stride=2;
2051     s->m.mb_x=
2052     s->m.mb_y= 0;
2053     c->skip= 0;
2054
2055     assert(c->  stride ==   stride);
2056     assert(c->uvstride == uvstride);
2057
2058     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2059     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2060     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2061     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2062
2063     c->xmin = - x*block_w - 16+2;
2064     c->ymin = - y*block_w - 16+2;
2065     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2066     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2067
2068     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2069     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2070     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2071     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2072     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2073     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2074     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2075
2076     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2077     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2078
2079     if (!y) {
2080         c->pred_x= P_LEFT[0];
2081         c->pred_y= P_LEFT[1];
2082     } else {
2083         c->pred_x = P_MEDIAN[0];
2084         c->pred_y = P_MEDIAN[1];
2085     }
2086
2087     score= INT_MAX;
2088     best_ref= 0;
2089     for(ref=0; ref<s->ref_frames; ref++){
2090         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2091
2092         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2093                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2094
2095         assert(ref_mx >= c->xmin);
2096         assert(ref_mx <= c->xmax);
2097         assert(ref_my >= c->ymin);
2098         assert(ref_my <= c->ymax);
2099
2100         ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2101         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2102         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2103         if(s->ref_mvs[ref]){
2104             s->ref_mvs[ref][index][0]= ref_mx;
2105             s->ref_mvs[ref][index][1]= ref_my;
2106             s->ref_scores[ref][index]= ref_score;
2107         }
2108         if(score > ref_score){
2109             score= ref_score;
2110             best_ref= ref;
2111             mx= ref_mx;
2112             my= ref_my;
2113         }
2114     }
2115     //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
2116
2117   //  subpel search
2118     base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
2119     pc= s->c;
2120     pc.bytestream_start=
2121     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2122     memcpy(p_state, s->block_state, sizeof(s->block_state));
2123
2124     if(level!=s->block_max_depth)
2125         put_rac(&pc, &p_state[4 + s_context], 1);
2126     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2127     if(s->ref_frames > 1)
2128         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2129     pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2130     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2131     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2132     p_len= pc.bytestream - pc.bytestream_start;
2133     score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
2134
2135     block_s= block_w*block_w;
2136     sum = pix_sum(current_data[0], stride, block_w);
2137     l= (sum + block_s/2)/block_s;
2138     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2139
2140     block_s= block_w*block_w>>2;
2141     sum = pix_sum(current_data[1], uvstride, block_w>>1);
2142     cb= (sum + block_s/2)/block_s;
2143 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2144     sum = pix_sum(current_data[2], uvstride, block_w>>1);
2145     cr= (sum + block_s/2)/block_s;
2146 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2147
2148     ic= s->c;
2149     ic.bytestream_start=
2150     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2151     memcpy(i_state, s->block_state, sizeof(s->block_state));
2152     if(level!=s->block_max_depth)
2153         put_rac(&ic, &i_state[4 + s_context], 1);
2154     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2155     put_symbol(&ic, &i_state[32],  l-pl , 1);
2156     put_symbol(&ic, &i_state[64], cb-pcb, 1);
2157     put_symbol(&ic, &i_state[96], cr-pcr, 1);
2158     i_len= ic.bytestream - ic.bytestream_start;
2159     iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
2160
2161 //    assert(score==256*256*256*64-1);
2162     assert(iscore < 255*255*256 + s->lambda2*10);
2163     assert(iscore >= 0);
2164     assert(l>=0 && l<=255);
2165     assert(pl>=0 && pl<=255);
2166
2167     if(level==0){
2168         int varc= iscore >> 8;
2169         int vard= score >> 8;
2170         if (vard <= 64 || vard < varc)
2171             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2172         else
2173             c->scene_change_score+= s->m.qscale;
2174     }
2175
2176     if(level!=s->block_max_depth){
2177         put_rac(&s->c, &s->block_state[4 + s_context], 0);
2178         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2179         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2180         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2181         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2182         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2183
2184         if(score2 < score && score2 < iscore)
2185             return score2;
2186     }
2187
2188     if(iscore < score){
2189         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2190         memcpy(pbbak, i_buffer, i_len);
2191         s->c= ic;
2192         s->c.bytestream_start= pbbak_start;
2193         s->c.bytestream= pbbak + i_len;
2194         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2195         memcpy(s->block_state, i_state, sizeof(s->block_state));
2196         return iscore;
2197     }else{
2198         memcpy(pbbak, p_buffer, p_len);
2199         s->c= pc;
2200         s->c.bytestream_start= pbbak_start;
2201         s->c.bytestream= pbbak + p_len;
2202         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2203         memcpy(s->block_state, p_state, sizeof(s->block_state));
2204         return score;
2205     }
2206 }
2207
2208 static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2209     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2210         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2211     }else{
2212         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2213     }
2214 }
2215
2216 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2217     const int w= s->b_width  << s->block_max_depth;
2218     const int rem_depth= s->block_max_depth - level;
2219     const int index= (x + y*w) << rem_depth;
2220     int trx= (x+1)<<rem_depth;
2221     BlockNode *b= &s->block[index];
2222     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2223     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2224     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2225     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2226     int pl = left->color[0];
2227     int pcb= left->color[1];
2228     int pcr= left->color[2];
2229     int pmx, pmy;
2230     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2231     int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2232     int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2233     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2234
2235     if(s->keyframe){
2236         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2237         return;
2238     }
2239
2240     if(level!=s->block_max_depth){
2241         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2242             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2243         }else{
2244             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2245             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2246             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2247             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2248             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2249             return;
2250         }
2251     }
2252     if(b->type & BLOCK_INTRA){
2253         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2254         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2255         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2256         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2257         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2258         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2259     }else{
2260         pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2261         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2262         if(s->ref_frames > 1)
2263             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2264         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2265         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2266         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2267     }
2268 }
2269
2270 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2271     const int w= s->b_width << s->block_max_depth;
2272     const int rem_depth= s->block_max_depth - level;
2273     const int index= (x + y*w) << rem_depth;
2274     int trx= (x+1)<<rem_depth;
2275     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2276     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2277     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2278     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2279     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2280
2281     if(s->keyframe){
2282         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2283         return;
2284     }
2285
2286     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2287         int type, mx, my;
2288         int l = left->color[0];
2289         int cb= left->color[1];
2290         int cr= left->color[2];
2291         int ref = 0;
2292         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2293         int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2294         int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2295
2296         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2297
2298         if(type){
2299             pred_mv(s, &mx, &my, 0, left, top, tr);
2300             l += get_symbol(&s->c, &s->block_state[32], 1);
2301             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2302             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2303         }else{
2304             if(s->ref_frames > 1)
2305                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2306             pred_mv(s, &mx, &my, ref, left, top, tr);
2307             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2308             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2309         }
2310         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2311     }else{
2312         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2313         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2314         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2315         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2316     }
2317 }
2318
2319 static void encode_blocks(SnowContext *s, int search){
2320     int x, y;
2321     int w= s->b_width;
2322     int h= s->b_height;
2323
2324     if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2325         iterative_me(s);
2326
2327     for(y=0; y<h; y++){
2328         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2329             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2330             return;
2331         }
2332         for(x=0; x<w; x++){
2333             if(s->avctx->me_method == ME_ITER || !search)
2334                 encode_q_branch2(s, 0, x, y);
2335             else
2336                 encode_q_branch (s, 0, x, y);
2337         }
2338     }
2339 }
2340
2341 static void decode_blocks(SnowContext *s){
2342     int x, y;
2343     int w= s->b_width;
2344     int h= s->b_height;
2345
2346     for(y=0; y<h; y++){
2347         for(x=0; x<w; x++){
2348             decode_q_branch(s, 0, x, y);
2349         }
2350     }
2351 }
2352
2353 static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2354     int x, y;
2355 START_TIMER
2356     for(y=0; y < b_h+5; y++){
2357         for(x=0; x < b_w; x++){
2358             int a0= src[x    ];
2359             int a1= src[x + 1];
2360             int a2= src[x + 2];
2361             int a3= src[x + 3];
2362             int a4= src[x + 4];
2363             int a5= src[x + 5];
2364 //            int am= 9*(a1+a2) - (a0+a3);
2365             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2366 //            int am= 18*(a2+a3) - 2*(a1+a4);
2367 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2368 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2369
2370 //            if(b_w==16) am= 8*(a1+a2);
2371
2372             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2373             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2374
2375             /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2376             if(am&(~255)) am= ~(am>>31);
2377
2378             tmp[x] = am;
2379
2380 /*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2381             else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2382             else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2383             else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2384         }
2385         tmp += stride;
2386         src += stride;
2387     }
2388     tmp -= (b_h+5)*stride;
2389
2390     for(y=0; y < b_h; y++){
2391         for(x=0; x < b_w; x++){
2392             int a0= tmp[x + 0*stride];
2393             int a1= tmp[x + 1*stride];
2394             int a2= tmp[x + 2*stride];
2395             int a3= tmp[x + 3*stride];
2396             int a4= tmp[x + 4*stride];
2397             int a5= tmp[x + 5*stride];
2398             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2399 //            int am= 18*(a2+a3) - 2*(a1+a4);
2400 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2401             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2402
2403 //            if(b_w==16) am= 8*(a1+a2);
2404
2405             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2406             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2407
2408             if(am&(~255)) am= ~(am>>31);
2409
2410             dst[x] = am;
2411 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2412             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2413             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2414             else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2415         }
2416         dst += stride;
2417         tmp += stride;
2418     }
2419 STOP_TIMER("mc_block")
2420 }
2421
2422 #define mca(dx,dy,b_w)\
2423 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
2424     uint8_t tmp[stride*(b_w+5)];\
2425     assert(h==b_w);\
2426     mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2427 }
2428
2429 mca( 0, 0,16)
2430 mca( 8, 0,16)
2431 mca( 0, 8,16)
2432 mca( 8, 8,16)
2433 mca( 0, 0,8)
2434 mca( 8, 0,8)
2435 mca( 0, 8,8)
2436 mca( 8, 8,8)
2437
2438 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2439     if(block->type & BLOCK_INTRA){
2440         int x, y;
2441         const int color = block->color[plane_index];
2442         const int color4= color*0x01010101;
2443         if(b_w==32){
2444             for(y=0; y < b_h; y++){
2445                 *(uint32_t*)&dst[0 + y*stride]= color4;
2446                 *(uint32_t*)&dst[4 + y*stride]= color4;
2447                 *(uint32_t*)&dst[8 + y*stride]= color4;
2448                 *(uint32_t*)&dst[12+ y*stride]= color4;
2449                 *(uint32_t*)&dst[16+ y*stride]= color4;
2450                 *(uint32_t*)&dst[20+ y*stride]= color4;
2451                 *(uint32_t*)&dst[24+ y*stride]= color4;
2452                 *(uint32_t*)&dst[28+ y*stride]= color4;
2453             }
2454         }else if(b_w==16){
2455             for(y=0; y < b_h; y++){
2456                 *(uint32_t*)&dst[0 + y*stride]= color4;
2457                 *(uint32_t*)&dst[4 + y*stride]= color4;
2458                 *(uint32_t*)&dst[8 + y*stride]= color4;
2459                 *(uint32_t*)&dst[12+ y*stride]= color4;
2460             }
2461         }else if(b_w==8){
2462             for(y=0; y < b_h; y++){
2463                 *(uint32_t*)&dst[0 + y*stride]= color4;
2464                 *(uint32_t*)&dst[4 + y*stride]= color4;
2465             }
2466         }else if(b_w==4){
2467             for(y=0; y < b_h; y++){
2468                 *(uint32_t*)&dst[0 + y*stride]= color4;
2469             }
2470         }else{
2471             for(y=0; y < b_h; y++){
2472                 for(x=0; x < b_w; x++){
2473                     dst[x + y*stride]= color;
2474                 }
2475             }
2476         }
2477     }else{
2478         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2479         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2480         int mx= block->mx*scale;
2481         int my= block->my*scale;
2482         const int dx= mx&15;
2483         const int dy= my&15;
2484         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2485         sx += (mx>>4) - 2;
2486         sy += (my>>4) - 2;
2487         src += sx + sy*stride;
2488         if(   (unsigned)sx >= w - b_w - 4
2489            || (unsigned)sy >= h - b_h - 4){
2490             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2491             src= tmp + MB_SIZE;
2492         }
2493 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2494 //        assert(!(b_w&(b_w-1)));
2495         assert(b_w>1 && b_h>1);
2496         assert(tab_index>=0 && tab_index<4 || b_w==32);
2497         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2498             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2499         else if(b_w==32){
2500             int y;
2501             for(y=0; y<b_h; y+=16){
2502                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2503                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2504             }
2505         }else if(b_w==b_h)
2506             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2507         else if(b_w==2*b_h){
2508             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2509             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2510         }else{
2511             assert(2*b_w==b_h);
2512             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2513             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2514         }
2515     }
2516 }
2517
2518 void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2519                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2520     int y, x;
2521     DWTELEM * dst;
2522     for(y=0; y<b_h; y++){
2523         //FIXME ugly missue of obmc_stride
2524         const uint8_t *obmc1= obmc + y*obmc_stride;
2525         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2526         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2527         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2528         dst = slice_buffer_get_line(sb, src_y + y);
2529         for(x=0; x<b_w; x++){
2530             int v=   obmc1[x] * block[3][x + y*src_stride]
2531                     +obmc2[x] * block[2][x + y*src_stride]
2532                     +obmc3[x] * block[1][x + y*src_stride]
2533                     +obmc4[x] * block[0][x + y*src_stride];
2534
2535             v <<= 8 - LOG2_OBMC_MAX;
2536             if(FRAC_BITS != 8){
2537                 v += 1<<(7 - FRAC_BITS);
2538                 v >>= 8 - FRAC_BITS;
2539             }
2540             if(add){
2541                 v += dst[x + src_x];
2542                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2543                 if(v&(~255)) v= ~(v>>31);
2544                 dst8[x + y*src_stride] = v;
2545             }else{
2546                 dst[x + src_x] -= v;
2547             }
2548         }
2549     }
2550 }
2551
2552 //FIXME name clenup (b_w, block_w, b_width stuff)
2553 static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2554     const int b_width = s->b_width  << s->block_max_depth;
2555     const int b_height= s->b_height << s->block_max_depth;
2556     const int b_stride= b_width;
2557     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2558     BlockNode *rt= lt+1;
2559     BlockNode *lb= lt+b_stride;
2560     BlockNode *rb= lb+1;
2561     uint8_t *block[4];
2562     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2563     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2564     uint8_t *ptmp;
2565     int x,y;
2566
2567     if(b_x<0){
2568         lt= rt;
2569         lb= rb;
2570     }else if(b_x + 1 >= b_width){
2571         rt= lt;
2572         rb= lb;
2573     }
2574     if(b_y<0){
2575         lt= lb;
2576         rt= rb;
2577     }else if(b_y + 1 >= b_height){
2578         lb= lt;
2579         rb= rt;
2580     }
2581
2582     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2583         obmc -= src_x;
2584         b_w += src_x;
2585         if(!sliced && !offset_dst)
2586             dst -= src_x;
2587         src_x=0;
2588     }else if(src_x + b_w > w){
2589         b_w = w - src_x;
2590     }
2591     if(src_y<0){
2592         obmc -= src_y*obmc_stride;
2593         b_h += src_y;
2594         if(!sliced && !offset_dst)
2595             dst -= src_y*dst_stride;
2596         src_y=0;
2597     }else if(src_y + b_h> h){
2598         b_h = h - src_y;
2599     }
2600
2601     if(b_w<=0 || b_h<=0) return;
2602
2603 assert(src_stride > 2*MB_SIZE + 5);
2604     if(!sliced && offset_dst)
2605         dst += src_x + src_y*dst_stride;
2606     dst8+= src_x + src_y*src_stride;
2607 //    src += src_x + src_y*src_stride;
2608
2609     ptmp= tmp + 3*tmp_step;
2610     block[0]= ptmp;
2611     ptmp+=tmp_step;
2612     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2613
2614     if(same_block(lt, rt)){
2615         block[1]= block[0];
2616     }else{
2617         block[1]= ptmp;
2618         ptmp+=tmp_step;
2619         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2620     }
2621
2622     if(same_block(lt, lb)){
2623         block[2]= block[0];
2624     }else if(same_block(rt, lb)){
2625         block[2]= block[1];
2626     }else{
2627         block[2]= ptmp;
2628         ptmp+=tmp_step;
2629         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2630     }
2631
2632     if(same_block(lt, rb) ){
2633         block[3]= block[0];
2634     }else if(same_block(rt, rb)){
2635         block[3]= block[1];
2636     }else if(same_block(lb, rb)){
2637         block[3]= block[2];
2638     }else{
2639         block[3]= ptmp;
2640         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2641     }
2642 #if 0
2643     for(y=0; y<b_h; y++){
2644         for(x=0; x<b_w; x++){
2645             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2646             if(add) dst[x + y*dst_stride] += v;
2647             else    dst[x + y*dst_stride] -= v;
2648         }
2649     }
2650     for(y=0; y<b_h; y++){
2651         uint8_t *obmc2= obmc + (obmc_stride>>1);
2652         for(x=0; x<b_w; x++){
2653             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2654             if(add) dst[x + y*dst_stride] += v;
2655             else    dst[x + y*dst_stride] -= v;
2656         }
2657     }
2658     for(y=0; y<b_h; y++){
2659         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2660         for(x=0; x<b_w; x++){
2661             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2662             if(add) dst[x + y*dst_stride] += v;
2663             else    dst[x + y*dst_stride] -= v;
2664         }
2665     }
2666     for(y=0; y<b_h; y++){
2667         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2668         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2669         for(x=0; x<b_w; x++){
2670             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2671             if(add) dst[x + y*dst_stride] += v;
2672             else    dst[x + y*dst_stride] -= v;
2673         }
2674     }
2675 #else
2676     if(sliced){
2677         START_TIMER
2678
2679         s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2680         STOP_TIMER("inner_add_yblock")
2681     }else
2682     for(y=0; y<b_h; y++){
2683         //FIXME ugly missue of obmc_stride
2684         const uint8_t *obmc1= obmc + y*obmc_stride;
2685         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2686         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2687         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2688         for(x=0; x<b_w; x++){
2689             int v=   obmc1[x] * block[3][x + y*src_stride]
2690                     +obmc2[x] * block[2][x + y*src_stride]
2691                     +obmc3[x] * block[1][x + y*src_stride]
2692                     +obmc4[x] * block[0][x + y*src_stride];
2693
2694             v <<= 8 - LOG2_OBMC_MAX;
2695             if(FRAC_BITS != 8){
2696                 v += 1<<(7 - FRAC_BITS);
2697                 v >>= 8 - FRAC_BITS;
2698             }
2699             if(add){
2700                 v += dst[x + y*dst_stride];
2701                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2702                 if(v&(~255)) v= ~(v>>31);
2703                 dst8[x + y*src_stride] = v;
2704             }else{
2705                 dst[x + y*dst_stride] -= v;
2706             }
2707         }
2708     }
2709 #endif
2710 }
2711
2712 static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2713     Plane *p= &s->plane[plane_index];
2714     const int mb_w= s->b_width  << s->block_max_depth;
2715     const int mb_h= s->b_height << s->block_max_depth;
2716     int x, y, mb_x;
2717     int block_size = MB_SIZE >> s->block_max_depth;
2718     int block_w    = plane_index ? block_size/2 : block_size;
2719     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2720     int obmc_stride= plane_index ? block_size : 2*block_size;
2721     int ref_stride= s->current_picture.linesize[plane_index];
2722     uint8_t *dst8= s->current_picture.data[plane_index];
2723     int w= p->width;
2724     int h= p->height;
2725     START_TIMER
2726
2727     if(s->keyframe || (s->avctx->debug&512)){
2728         if(mb_y==mb_h)
2729             return;
2730
2731         if(add){
2732             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2733             {
2734 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2735                 DWTELEM * line = sb->line[y];
2736                 for(x=0; x<w; x++)
2737                 {
2738 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2739                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2740                     v >>= FRAC_BITS;
2741                     if(v&(~255)) v= ~(v>>31);
2742                     dst8[x + y*ref_stride]= v;
2743                 }
2744             }
2745         }else{
2746             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2747             {
2748 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2749                 DWTELEM * line = sb->line[y];
2750                 for(x=0; x<w; x++)
2751                 {
2752                     line[x] -= 128 << FRAC_BITS;
2753 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2754                 }
2755             }
2756         }
2757
2758         return;
2759     }
2760
2761         for(mb_x=0; mb_x<=mb_w; mb_x++){
2762             START_TIMER
2763
2764             add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2765                        block_w*mb_x - block_w/2,
2766                        block_w*mb_y - block_w/2,
2767                        block_w, block_w,
2768                        w, h,
2769                        w, ref_stride, obmc_stride,
2770                        mb_x - 1, mb_y - 1,
2771                        add, 0, plane_index);
2772
2773             STOP_TIMER("add_yblock")
2774         }
2775
2776     STOP_TIMER("predict_slice")
2777 }
2778
2779 static av_always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2780     Plane *p= &s->plane[plane_index];
2781     const int mb_w= s->b_width  << s->block_max_depth;
2782     const int mb_h= s->b_height << s->block_max_depth;
2783     int x, y, mb_x;
2784     int block_size = MB_SIZE >> s->block_max_depth;
2785     int block_w    = plane_index ? block_size/2 : block_size;
2786     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2787     const int obmc_stride= plane_index ? block_size : 2*block_size;
2788     int ref_stride= s->current_picture.linesize[plane_index];
2789     uint8_t *dst8= s->current_picture.data[plane_index];
2790     int w= p->width;
2791     int h= p->height;
2792     START_TIMER
2793
2794     if(s->keyframe || (s->avctx->debug&512)){
2795         if(mb_y==mb_h)
2796             return;
2797
2798         if(add){
2799             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2800                 for(x=0; x<w; x++){
2801                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2802                     v >>= FRAC_BITS;
2803                     if(v&(~255)) v= ~(v>>31);
2804                     dst8[x + y*ref_stride]= v;
2805                 }
2806             }
2807         }else{
2808             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2809                 for(x=0; x<w; x++){
2810                     buf[x + y*w]-= 128<<FRAC_BITS;
2811                 }
2812             }
2813         }
2814
2815         return;
2816     }
2817
2818         for(mb_x=0; mb_x<=mb_w; mb_x++){
2819             START_TIMER
2820
2821             add_yblock(s, 0, NULL, buf, dst8, obmc,
2822                        block_w*mb_x - block_w/2,
2823                        block_w*mb_y - block_w/2,
2824                        block_w, block_w,
2825                        w, h,
2826                        w, ref_stride, obmc_stride,
2827                        mb_x - 1, mb_y - 1,
2828                        add, 1, plane_index);
2829
2830             STOP_TIMER("add_yblock")
2831         }
2832
2833     STOP_TIMER("predict_slice")
2834 }
2835
2836 static av_always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2837     const int mb_h= s->b_height << s->block_max_depth;
2838     int mb_y;
2839     for(mb_y=0; mb_y<=mb_h; mb_y++)
2840         predict_slice(s, buf, plane_index, add, mb_y);
2841 }
2842
2843 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2844     int i, x2, y2;
2845     Plane *p= &s->plane[plane_index];
2846     const int block_size = MB_SIZE >> s->block_max_depth;
2847     const int block_w    = plane_index ? block_size/2 : block_size;
2848     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2849     const int obmc_stride= plane_index ? block_size : 2*block_size;
2850     const int ref_stride= s->current_picture.linesize[plane_index];
2851     uint8_t *src= s-> input_picture.data[plane_index];
2852     DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2853     const int b_stride = s->b_width << s->block_max_depth;
2854     const int w= p->width;
2855     const int h= p->height;
2856     int index= mb_x + mb_y*b_stride;
2857     BlockNode *b= &s->block[index];
2858     BlockNode backup= *b;
2859     int ab=0;
2860     int aa=0;
2861
2862     b->type|= BLOCK_INTRA;
2863     b->color[plane_index]= 0;
2864     memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2865
2866     for(i=0; i<4; i++){
2867         int mb_x2= mb_x + (i &1) - 1;
2868         int mb_y2= mb_y + (i>>1) - 1;
2869         int x= block_w*mb_x2 + block_w/2;
2870         int y= block_w*mb_y2 + block_w/2;
2871
2872         add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2873                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2874
2875         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2876             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2877                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2878                 int obmc_v= obmc[index];
2879                 int d;
2880                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2881                 if(x<0) obmc_v += obmc[index + block_w];
2882                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2883                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2884                 //FIXME precalc this or simplify it somehow else
2885
2886                 d = -dst[index] + (1<<(FRAC_BITS-1));
2887                 dst[index] = d;
2888                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2889                 aa += obmc_v * obmc_v; //FIXME precalclate this
2890             }
2891         }
2892     }
2893     *b= backup;
2894
2895     return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2896 }
2897
2898 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2899     const int b_stride = s->b_width << s->block_max_depth;
2900     const int b_height = s->b_height<< s->block_max_depth;
2901     int index= x + y*b_stride;
2902     const BlockNode *b     = &s->block[index];
2903     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2904     const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2905     const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2906     const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2907     int dmx, dmy;
2908 //  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2909 //  int my_context= av_log2(2*FFABS(left->my - top->my));
2910
2911     if(x<0 || x>=b_stride || y>=b_height)
2912         return 0;
2913 /*
2914 1            0      0
2915 01X          1-2    1
2916 001XX        3-6    2-3
2917 0001XXX      7-14   4-7
2918 00001XXXX   15-30   8-15
2919 */
2920 //FIXME try accurate rate
2921 //FIXME intra and inter predictors if surrounding blocks arent the same type
2922     if(b->type & BLOCK_INTRA){
2923         return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2924                    + av_log2(2*FFABS(left->color[1] - b->color[1]))
2925                    + av_log2(2*FFABS(left->color[2] - b->color[2])));
2926     }else{
2927         pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2928         dmx-= b->mx;
2929         dmy-= b->my;
2930         return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2931                     + av_log2(2*FFABS(dmy))
2932                     + av_log2(2*b->ref));
2933     }
2934 }
2935
2936 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2937     Plane *p= &s->plane[plane_index];
2938     const int block_size = MB_SIZE >> s->block_max_depth;
2939     const int block_w    = plane_index ? block_size/2 : block_size;
2940     const int obmc_stride= plane_index ? block_size : 2*block_size;
2941     const int ref_stride= s->current_picture.linesize[plane_index];
2942     uint8_t *dst= s->current_picture.data[plane_index];
2943     uint8_t *src= s->  input_picture.data[plane_index];
2944     DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2945     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2946     uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2947     const int b_stride = s->b_width << s->block_max_depth;
2948     const int b_height = s->b_height<< s->block_max_depth;
2949     const int w= p->width;
2950     const int h= p->height;
2951     int distortion;
2952     int rate= 0;
2953     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2954     int sx= block_w*mb_x - block_w/2;
2955     int sy= block_w*mb_y - block_w/2;
2956     int x0= FFMAX(0,-sx);
2957     int y0= FFMAX(0,-sy);
2958     int x1= FFMIN(block_w*2, w-sx);
2959     int y1= FFMIN(block_w*2, h-sy);
2960     int i,x,y;
2961
2962     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2963
2964     for(y=y0; y<y1; y++){
2965         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2966         const DWTELEM *pred1 = pred + y*obmc_stride;
2967         uint8_t *cur1 = cur + y*ref_stride;
2968         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2969         for(x=x0; x<x1; x++){
2970             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2971             v = (v + pred1[x]) >> FRAC_BITS;
2972             if(v&(~255)) v= ~(v>>31);
2973             dst1[x] = v;
2974         }
2975     }
2976
2977     /* copy the regions where obmc[] = (uint8_t)256 */
2978     if(LOG2_OBMC_MAX == 8
2979         && (mb_x == 0 || mb_x == b_stride-1)
2980         && (mb_y == 0 || mb_y == b_height-1)){
2981         if(mb_x == 0)
2982             x1 = block_w;
2983         else
2984             x0 = block_w;
2985         if(mb_y == 0)
2986             y1 = block_w;
2987         else
2988             y0 = block_w;
2989         for(y=y0; y<y1; y++)
2990             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2991     }
2992
2993     if(block_w==16){
2994         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2995         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2996         /* FIXME cmps overlap but don't cover the wavelet's whole support,
2997          * so improving the score of one block is not strictly guaranteed to
2998          * improve the score of the whole frame, so iterative motion est
2999          * doesn't always converge. */
3000         if(s->avctx->me_cmp == FF_CMP_W97)
3001             distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3002         else if(s->avctx->me_cmp == FF_CMP_W53)
3003             distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3004         else{
3005             distortion = 0;
3006             for(i=0; i<4; i++){
3007                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3008                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3009             }
3010         }
3011     }else{
3012         assert(block_w==8);
3013         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3014     }
3015
3016     if(plane_index==0){
3017         for(i=0; i<4; i++){
3018 /* ..RRr
3019  * .RXx.
3020  * rxx..
3021  */
3022             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3023         }
3024         if(mb_x == b_stride-2)
3025             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3026     }
3027     return distortion + rate*penalty_factor;
3028 }
3029
3030 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3031     int i, y2;
3032     Plane *p= &s->plane[plane_index];
3033     const int block_size = MB_SIZE >> s->block_max_depth;
3034     const int block_w    = plane_index ? block_size/2 : block_size;
3035     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3036     const int obmc_stride= plane_index ? block_size : 2*block_size;
3037     const int ref_stride= s->current_picture.linesize[plane_index];
3038     uint8_t *dst= s->current_picture.data[plane_index];
3039     uint8_t *src= s-> input_picture.data[plane_index];
3040     static const DWTELEM zero_dst[4096]; //FIXME
3041     const int b_stride = s->b_width << s->block_max_depth;
3042     const int w= p->width;
3043     const int h= p->height;
3044     int distortion= 0;
3045     int rate= 0;
3046     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3047
3048     for(i=0; i<9; i++){
3049         int mb_x2= mb_x + (i%3) - 1;
3050         int mb_y2= mb_y + (i/3) - 1;
3051         int x= block_w*mb_x2 + block_w/2;
3052         int y= block_w*mb_y2 + block_w/2;
3053
3054         add_yblock(s, 0, NULL, zero_dst, dst, obmc,
3055                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3056
3057         //FIXME find a cleaner/simpler way to skip the outside stuff
3058         for(y2= y; y2<0; y2++)
3059             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3060         for(y2= h; y2<y+block_w; y2++)
3061             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3062         if(x<0){
3063             for(y2= y; y2<y+block_w; y2++)
3064                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3065         }
3066         if(x+block_w > w){
3067             for(y2= y; y2<y+block_w; y2++)
3068                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3069         }
3070
3071         assert(block_w== 8 || block_w==16);
3072         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3073     }
3074
3075     if(plane_index==0){
3076         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3077         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3078
3079 /* ..RRRr
3080  * .RXXx.
3081  * .RXXx.
3082  * rxxx.
3083  */
3084         if(merged)
3085             rate = get_block_bits(s, mb_x, mb_y, 2);
3086         for(i=merged?4:0; i<9; i++){
3087             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3088             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3089         }
3090     }
3091     return distortion + rate*penalty_factor;
3092 }
3093
3094 static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3095     const int b_stride= s->b_width << s->block_max_depth;
3096     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3097     BlockNode backup= *block;
3098     int rd, index, value;
3099
3100     assert(mb_x>=0 && mb_y>=0);
3101     assert(mb_x<b_stride);
3102
3103     if(intra){
3104         block->color[0] = p[0];
3105         block->color[1] = p[1];
3106         block->color[2] = p[2];
3107         block->type |= BLOCK_INTRA;
3108     }else{
3109         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3110         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3111         if(s->me_cache[index] == value)
3112             return 0;
3113         s->me_cache[index]= value;
3114
3115         block->mx= p[0];
3116         block->my= p[1];
3117         block->type &= ~BLOCK_INTRA;
3118     }
3119
3120     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3121
3122 //FIXME chroma
3123     if(rd < *best_rd){
3124         *best_rd= rd;
3125         return 1;
3126     }else{
3127         *block= backup;
3128         return 0;
3129     }
3130 }
3131
3132 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3133 static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3134     int p[2] = {p0, p1};
3135     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3136 }
3137
3138 static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3139     const int b_stride= s->b_width << s->block_max_depth;
3140     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3141     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3142     int rd, index, value;
3143
3144     assert(mb_x>=0 && mb_y>=0);
3145     assert(mb_x<b_stride);
3146     assert(((mb_x|mb_y)&1) == 0);
3147
3148     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3149     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3150     if(s->me_cache[index] == value)
3151         return 0;
3152     s->me_cache[index]= value;
3153
3154     block->mx= p0;
3155     block->my= p1;
3156     block->ref= ref;
3157     block->type &= ~BLOCK_INTRA;
3158     block[1]= block[b_stride]= block[b_stride+1]= *block;
3159
3160     rd= get_4block_rd(s, mb_x, mb_y, 0);
3161
3162 //FIXME chroma
3163     if(rd < *best_rd){
3164         *best_rd= rd;
3165         return 1;
3166     }else{
3167         block[0]= backup[0];
3168         block[1]= backup[1];
3169         block[b_stride]= backup[2];
3170         block[b_stride+1]= backup[3];
3171         return 0;
3172     }
3173 }
3174
3175 static void iterative_me(SnowContext *s){
3176     int pass, mb_x, mb_y;
3177     const int b_width = s->b_width  << s->block_max_depth;
3178     const int b_height= s->b_height << s->block_max_depth;
3179     const int b_stride= b_width;
3180     int color[3];
3181
3182     {
3183         RangeCoder r = s->c;
3184         uint8_t state[sizeof(s->block_state)];
3185         memcpy(state, s->block_state, sizeof(s->block_state));
3186         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3187             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3188                 encode_q_branch(s, 0, mb_x, mb_y);
3189         s->c = r;
3190         memcpy(s->block_state, state, sizeof(s->block_state));
3191     }
3192
3193     for(pass=0; pass<25; pass++){
3194         int change= 0;
3195
3196         for(mb_y= 0; mb_y<b_height; mb_y++){
3197             for(mb_x= 0; mb_x<b_width; mb_x++){
3198                 int dia_change, i, j, ref;
3199                 int best_rd= INT_MAX, ref_rd;
3200                 BlockNode backup, ref_b;
3201                 const int index= mb_x + mb_y * b_stride;
3202                 BlockNode *block= &s->block[index];
3203                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3204                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3205                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3206                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3207                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3208                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3209                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3210                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3211                 const int b_w= (MB_SIZE >> s->block_max_depth);
3212                 uint8_t obmc_edged[b_w*2][b_w*2];
3213
3214                 if(pass && (block->type & BLOCK_OPT))
3215                     continue;
3216                 block->type |= BLOCK_OPT;
3217
3218                 backup= *block;
3219
3220                 if(!s->me_cache_generation)
3221                     memset(s->me_cache, 0, sizeof(s->me_cache));
3222                 s->me_cache_generation += 1<<22;
3223
3224                 //FIXME precalc
3225                 {
3226                     int x, y;
3227                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3228                     if(mb_x==0)
3229                         for(y=0; y<b_w*2; y++)
3230                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3231                     if(mb_x==b_stride-1)
3232                         for(y=0; y<b_w*2; y++)
3233                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3234                     if(mb_y==0){
3235                         for(x=0; x<b_w*2; x++)
3236                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3237                         for(y=1; y<b_w; y++)
3238                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3239                     }
3240                     if(mb_y==b_height-1){
3241                         for(x=0; x<b_w*2; x++)
3242                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3243                         for(y=b_w; y<b_w*2-1; y++)
3244                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3245                     }
3246                 }
3247
3248                 //skip stuff outside the picture
3249                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3250                 {
3251                     uint8_t *src= s->  input_picture.data[0];
3252                     uint8_t *dst= s->current_picture.data[0];
3253                     const int stride= s->current_picture.linesize[0];
3254                     const int block_w= MB_SIZE >> s->block_max_depth;
3255                     const int sx= block_w*mb_x - block_w/2;
3256                     const int sy= block_w*mb_y - block_w/2;
3257                     const int w= s->plane[0].width;
3258                     const int h= s->plane[0].height;
3259                     int y;
3260
3261                     for(y=sy; y<0; y++)
3262                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3263                     for(y=h; y<sy+block_w*2; y++)
3264                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3265                     if(sx<0){
3266                         for(y=sy; y<sy+block_w*2; y++)
3267                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3268                     }
3269                     if(sx+block_w*2 > w){
3270                         for(y=sy; y<sy+block_w*2; y++)
3271                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3272                     }
3273                 }
3274
3275                 // intra(black) = neighbors' contribution to the current block
3276                 for(i=0; i<3; i++)
3277                     color[i]= get_dc(s, mb_x, mb_y, i);
3278
3279                 // get previous score (cannot be cached due to OBMC)
3280                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3281                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3282                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3283                 }else
3284                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3285
3286                 ref_b= *block;
3287                 ref_rd= best_rd;
3288                 for(ref=0; ref < s->ref_frames; ref++){
3289                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3290                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3291                         continue;
3292                     block->ref= ref;
3293                     best_rd= INT_MAX;
3294
3295                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3296                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3297                     if(tb)
3298                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3299                     if(lb)
3300                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3301                     if(rb)
3302                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3303                     if(bb)
3304                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3305
3306                     /* fullpel ME */
3307                     //FIXME avoid subpel interpol / round to nearest integer
3308                     do{
3309                         dia_change=0;
3310                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3311                             for(j=0; j<i; j++){
3312                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3313                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3314                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3315                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3316                             }
3317                         }
3318                     }while(dia_change);
3319                     /* subpel ME */
3320                     do{
3321                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3322                         dia_change=0;
3323                         for(i=0; i<8; i++)
3324                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3325                     }while(dia_change);
3326                     //FIXME or try the standard 2 pass qpel or similar
3327
3328                     mvr[0][0]= block->mx;
3329                     mvr[0][1]= block->my;
3330                     if(ref_rd > best_rd){
3331                         ref_rd= best_rd;
3332                         ref_b= *block;
3333                     }
3334                 }
3335                 best_rd= ref_rd;
3336                 *block= ref_b;
3337 #if 1
3338                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3339                 //FIXME RD style color selection
3340 #endif
3341                 if(!same_block(block, &backup)){
3342                     if(tb ) tb ->type &= ~BLOCK_OPT;
3343                     if(lb ) lb ->type &= ~BLOCK_OPT;
3344                     if(rb ) rb ->type &= ~BLOCK_OPT;
3345                     if(bb ) bb ->type &= ~BLOCK_OPT;
3346                     if(tlb) tlb->type &= ~BLOCK_OPT;
3347                     if(trb) trb->type &= ~BLOCK_OPT;
3348                     if(blb) blb->type &= ~BLOCK_OPT;
3349                     if(brb) brb->type &= ~BLOCK_OPT;
3350                     change ++;
3351                 }
3352             }
3353         }
3354         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3355         if(!change)
3356             break;
3357     }
3358
3359     if(s->block_max_depth == 1){
3360         int change= 0;
3361         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3362             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3363                 int i;
3364                 int best_rd, init_rd;
3365                 const int index= mb_x + mb_y * b_stride;
3366                 BlockNode *b[4];
3367
3368                 b[0]= &s->block[index];
3369                 b[1]= b[0]+1;
3370                 b[2]= b[0]+b_stride;
3371                 b[3]= b[2]+1;
3372                 if(same_block(b[0], b[1]) &&
3373                    same_block(b[0], b[2]) &&
3374                    same_block(b[0], b[3]))
3375                     continue;
3376
3377                 if(!s->me_cache_generation)
3378                     memset(s->me_cache, 0, sizeof(s->me_cache));
3379                 s->me_cache_generation += 1<<22;
3380
3381                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3382
3383                 //FIXME more multiref search?
3384                 check_4block_inter(s, mb_x, mb_y,
3385                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3386                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3387
3388                 for(i=0; i<4; i++)
3389                     if(!(b[i]->type&BLOCK_INTRA))
3390                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3391
3392                 if(init_rd != best_rd)
3393                     change++;
3394             }
3395         }
3396         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3397     }
3398 }
3399
3400 static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3401     const int level= b->level;
3402     const int w= b->width;
3403     const int h= b->height;
3404     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3405     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3406     int x,y, thres1, thres2;
3407 //    START_TIMER
3408
3409     if(s->qlog == LOSSLESS_QLOG) return;
3410
3411     bias= bias ? 0 : (3*qmul)>>3;
3412     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3413     thres2= 2*thres1;
3414
3415     if(!bias){
3416         for(y=0; y<h; y++){
3417             for(x=0; x<w; x++){
3418                 int i= src[x + y*stride];
3419
3420                 if((unsigned)(i+thres1) > thres2){
3421                     if(i>=0){
3422                         i<<= QEXPSHIFT;
3423                         i/= qmul; //FIXME optimize
3424                         src[x + y*stride]=  i;
3425                     }else{
3426                         i= -i;
3427                         i<<= QEXPSHIFT;
3428                         i/= qmul; //FIXME optimize
3429                         src[x + y*stride]= -i;
3430                     }
3431                 }else
3432                     src[x + y*stride]= 0;
3433             }
3434         }
3435     }else{
3436         for(y=0; y<h; y++){
3437             for(x=0; x<w; x++){
3438                 int i= src[x + y*stride];
3439
3440                 if((unsigned)(i+thres1) > thres2){
3441                     if(i>=0){
3442                         i<<= QEXPSHIFT;
3443                         i= (i + bias) / qmul; //FIXME optimize
3444                         src[x + y*stride]=  i;
3445                     }else{
3446                         i= -i;
3447                         i<<= QEXPSHIFT;
3448                         i= (i + bias) / qmul; //FIXME optimize
3449                         src[x + y*stride]= -i;
3450                     }
3451                 }else
3452                     src[x + y*stride]= 0;
3453             }
3454         }
3455     }
3456     if(level+1 == s->spatial_decomposition_count){
3457 //        STOP_TIMER("quantize")
3458     }
3459 }
3460
3461 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3462     const int w= b->width;
3463     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3464     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3465     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3466     int x,y;
3467     START_TIMER
3468
3469     if(s->qlog == LOSSLESS_QLOG) return;
3470
3471     for(y=start_y; y<end_y; y++){
3472 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3473         DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3474         for(x=0; x<w; x++){
3475             int i= line[x];
3476             if(i<0){
3477                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3478             }else if(i>0){
3479                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3480             }
3481         }
3482     }
3483     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3484         STOP_TIMER("dquant")
3485     }
3486 }
3487
3488 static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3489     const int w= b->width;
3490     const int h= b->height;
3491     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3492     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3493     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3494     int x,y;
3495     START_TIMER
3496
3497     if(s->qlog == LOSSLESS_QLOG) return;
3498
3499     for(y=0; y<h; y++){
3500         for(x=0; x<w; x++){
3501             int i= src[x + y*stride];
3502             if(i<0){
3503                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3504             }else if(i>0){
3505                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3506             }
3507         }
3508     }
3509     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3510         STOP_TIMER("dquant")
3511     }
3512 }
3513
3514 static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3515     const int w= b->width;
3516     const int h= b->height;
3517     int x,y;
3518
3519     for(y=h-1; y>=0; y--){
3520         for(x=w-1; x>=0; x--){
3521             int i= x + y*stride;
3522
3523             if(x){
3524                 if(use_median){
3525                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3526                     else  src[i] -= src[i - 1];
3527                 }else{
3528                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3529                     else  src[i] -= src[i - 1];
3530                 }
3531             }else{
3532                 if(y) src[i] -= src[i - stride];
3533             }
3534         }
3535     }
3536 }
3537
3538 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3539     const int w= b->width;
3540     int x,y;
3541
3542 //    START_TIMER
3543
3544     DWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3545     DWTELEM * prev;
3546
3547     if (start_y != 0)
3548         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3549
3550     for(y=start_y; y<end_y; y++){
3551         prev = line;
3552 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3553         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3554         for(x=0; x<w; x++){
3555             if(x){
3556                 if(use_median){
3557                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3558                     else  line[x] += line[x - 1];
3559                 }else{
3560                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3561                     else  line[x] += line[x - 1];
3562                 }
3563             }else{
3564                 if(y) line[x] += prev[x];
3565             }
3566         }
3567     }
3568
3569 //    STOP_TIMER("correlate")
3570 }
3571
3572 static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3573     const int w= b->width;
3574     const int h= b->height;
3575     int x,y;
3576
3577     for(y=0; y<h; y++){
3578         for(x=0; x<w; x++){
3579             int i= x + y*stride;
3580
3581             if(x){
3582                 if(use_median){
3583                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3584                     else  src[i] += src[i - 1];
3585                 }else{
3586                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3587                     else  src[i] += src[i - 1];
3588                 }
3589             }else{
3590                 if(y) src[i] += src[i - stride];
3591             }
3592         }
3593     }
3594 }
3595
3596 static void encode_header(SnowContext *s){
3597     int plane_index, level, orientation;
3598     uint8_t kstate[32];
3599
3600     memset(kstate, MID_STATE, sizeof(kstate));
3601
3602     put_rac(&s->c, kstate, s->keyframe);
3603     if(s->keyframe || s->always_reset){
3604         reset_contexts(s);
3605         s->last_spatial_decomposition_type=
3606         s->last_qlog=
3607         s->last_qbias=
3608         s->last_mv_scale=
3609         s->last_block_max_depth= 0;
3610     }
3611     if(s->keyframe){
3612         put_symbol(&s->c, s->header_state, s->version, 0);
3613         put_rac(&s->c, s->header_state, s->always_reset);
3614         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3615         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3616         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3617         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3618         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3619         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3620         put_rac(&s->c, s->header_state, s->spatial_scalability);
3621 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3622         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3623
3624         for(plane_index=0; plane_index<2; plane_index++){
3625             for(level=0; level<s->spatial_decomposition_count; level++){
3626                 for(orientation=level ? 1:0; orientation<4; orientation++){
3627                     if(orientation==2) continue;
3628                     put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3629                 }
3630             }
3631         }
3632     }
3633     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3634     put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3635     put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3636     put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3637     put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3638
3639     s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3640     s->last_qlog                      = s->qlog;
3641     s->last_qbias                     = s->qbias;
3642     s->last_mv_scale                  = s->mv_scale;
3643     s->last_block_max_depth           = s->block_max_depth;
3644 }
3645
3646 static int decode_header(SnowContext *s){
3647     int plane_index, level, orientation;
3648     uint8_t kstate[32];
3649
3650     memset(kstate, MID_STATE, sizeof(kstate));
3651
3652     s->keyframe= get_rac(&s->c, kstate);
3653     if(s->keyframe || s->always_reset){
3654         reset_contexts(s);
3655         s->spatial_decomposition_type=
3656         s->qlog=
3657         s->qbias=
3658         s->mv_scale=
3659         s->block_max_depth= 0;
3660     }
3661     if(s->keyframe){
3662         s->version= get_symbol(&s->c, s->header_state, 0);
3663         if(s->version>0){
3664             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3665             return -1;
3666         }
3667         s->always_reset= get_rac(&s->c, s->header_state);
3668         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3669         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3670         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3671         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3672         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3673         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3674         s->spatial_scalability= get_rac(&s->c, s->header_state);
3675 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3676         s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3677
3678         for(plane_index=0; plane_index<3; plane_index++){
3679             for(level=0; level<s->spatial_decomposition_count; level++){
3680                 for(orientation=level ? 1:0; orientation<4; orientation++){
3681                     int q;
3682                     if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3683                     else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3684                     else                    q= get_symbol(&s->c, s->header_state, 1);
3685                     s->plane[plane_index].band[level][orientation].qlog= q;
3686                 }
3687             }
3688         }
3689     }
3690
3691     s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3692     if(s->spatial_decomposition_type > 2){
3693         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3694         return -1;
3695     }
3696
3697     s->qlog           += get_symbol(&s->c, s->header_state, 1);
3698     s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3699     s->qbias          += get_symbol(&s->c, s->header_state, 1);
3700     s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3701     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3702         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3703         s->block_max_depth= 0;
3704         return -1;
3705     }
3706
3707     return 0;
3708 }
3709
3710 static void init_qexp(void){
3711     int i;
3712     double v=128;
3713
3714     for(i=0; i<QROOT; i++){
3715         qexp[i]= lrintf(v);
3716         v *= pow(2, 1.0 / QROOT);
3717     }
3718 }
3719
3720 static int common_init(AVCodecContext *avctx){
3721     SnowContext *s = avctx->priv_data;
3722     int width, height;
3723     int level, orientation, plane_index, dec;
3724     int i, j;
3725
3726     s->avctx= avctx;
3727
3728     dsputil_init(&s->dsp, avctx);
3729
3730 #define mcf(dx,dy)\
3731     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3732     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3733         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3734     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3735     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3736         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3737
3738     mcf( 0, 0)
3739     mcf( 4, 0)
3740     mcf( 8, 0)
3741     mcf(12, 0)
3742     mcf( 0, 4)
3743     mcf( 4, 4)
3744     mcf( 8, 4)
3745     mcf(12, 4)
3746     mcf( 0, 8)
3747     mcf( 4, 8)
3748     mcf( 8, 8)
3749     mcf(12, 8)
3750     mcf( 0,12)
3751     mcf( 4,12)
3752     mcf( 8,12)
3753     mcf(12,12)
3754
3755 #define mcfh(dx,dy)\
3756     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3757     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3758         mc_block_hpel ## dx ## dy ## 16;\
3759     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3760     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3761         mc_block_hpel ## dx ## dy ## 8;
3762
3763     mcfh(0, 0)
3764     mcfh(8, 0)
3765     mcfh(0, 8)
3766     mcfh(8, 8)
3767
3768     if(!qexp[0])
3769         init_qexp();
3770
3771     dec= s->spatial_decomposition_count= 5;
3772     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3773
3774     s->chroma_h_shift= 1; //FIXME XXX
3775     s->chroma_v_shift= 1;
3776
3777 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3778
3779     width= s->avctx->width;
3780     height= s->avctx->height;
3781
3782     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3783
3784     s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3785     s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3786
3787     for(plane_index=0; plane_index<3; plane_index++){
3788         int w= s->avctx->width;
3789         int h= s->avctx->height;
3790
3791         if(plane_index){
3792             w>>= s->chroma_h_shift;
3793             h>>= s->chroma_v_shift;
3794         }
3795         s->plane[plane_index].width = w;
3796         s->plane[plane_index].height= h;
3797 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3798         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3799             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3800                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3801
3802                 b->buf= s->spatial_dwt_buffer;
3803                 b->level= level;
3804                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3805                 b->width = (w + !(orientation&1))>>1;
3806                 b->height= (h + !(orientation>1))>>1;
3807
3808                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3809                 b->buf_x_offset = 0;
3810                 b->buf_y_offset = 0;
3811
3812                 if(orientation&1){
3813                     b->buf += (w+1)>>1;
3814                     b->buf_x_offset = (w+1)>>1;
3815                 }
3816                 if(orientation>1){
3817                     b->buf += b->stride>>1;
3818                     b->buf_y_offset = b->stride_line >> 1;
3819                 }
3820
3821                 if(level)
3822                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3823                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3824             }
3825             w= (w+1)>>1;
3826             h= (h+1)>>1;
3827         }
3828     }
3829
3830     for(i=0; i<MAX_REF_FRAMES; i++)
3831         for(j=0; j<MAX_REF_FRAMES; j++)
3832             scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3833
3834     reset_contexts(s);
3835 /*
3836     width= s->width= avctx->width;
3837     height= s->height= avctx->height;
3838
3839     assert(width && height);
3840 */
3841     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3842
3843     return 0;
3844 }
3845
3846 static int qscale2qlog(int qscale){
3847     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3848            + 61*QROOT/8; //<64 >60
3849 }
3850
3851 static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3852 {
3853     /* estimate the frame's complexity as a sum of weighted dwt coefs.
3854      * FIXME we know exact mv bits at this point,
3855      * but ratecontrol isn't set up to include them. */
3856     uint32_t coef_sum= 0;
3857     int level, orientation, delta_qlog;
3858
3859     for(level=0; level<s->spatial_decomposition_count; level++){
3860         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3861             SubBand *b= &s->plane[0].band[level][orientation];
3862             DWTELEM *buf= b->buf;
3863             const int w= b->width;
3864             const int h= b->height;
3865             const int stride= b->stride;
3866             const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3867             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3868             const int qdiv= (1<<16)/qmul;
3869             int x, y;
3870             if(orientation==0)
3871                 decorrelate(s, b, buf, stride, 1, 0);
3872             for(y=0; y<h; y++)
3873                 for(x=0; x<w; x++)
3874                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3875             if(orientation==0)
3876                 correlate(s, b, buf, stride, 1, 0);
3877         }
3878     }
3879
3880     /* ugly, ratecontrol just takes a sqrt again */
3881     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3882     assert(coef_sum < INT_MAX);
3883
3884     if(pict->pict_type == I_TYPE){
3885         s->m.current_picture.mb_var_sum= coef_sum;
3886         s->m.current_picture.mc_mb_var_sum= 0;
3887     }else{
3888         s->m.current_picture.mc_mb_var_sum= coef_sum;
3889         s->m.current_picture.mb_var_sum= 0;
3890     }
3891
3892     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3893     if (pict->quality < 0)
3894         return INT_MIN;
3895     s->lambda= pict->quality * 3/2;
3896     delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3897     s->qlog+= delta_qlog;
3898     return delta_qlog;
3899 }
3900
3901 static void calculate_vissual_weight(SnowContext *s, Plane *p){
3902     int width = p->width;
3903     int height= p->height;
3904     int level, orientation, x, y;
3905
3906     for(level=0; level<s->spatial_decomposition_count; level++){
3907         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3908             SubBand *b= &p->band[level][orientation];
3909             DWTELEM *buf= b->buf;
3910             int64_t error=0;
3911
3912             memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3913             buf[b->width/2 + b->height/2*b->stride]= 256*256;
3914             ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3915             for(y=0; y<height; y++){
3916                 for(x=0; x<width; x++){
3917                     int64_t d= s->spatial_dwt_buffer[x + y*width];
3918                     error += d*d;
3919                 }
3920             }
3921
3922             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3923 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3924         }
3925     }
3926 }
3927
3928 static int encode_init(AVCodecContext *avctx)
3929 {
3930     SnowContext *s = avctx->priv_data;
3931     int plane_index;
3932
3933     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3934         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3935                "use vstrict=-2 / -strict -2 to use it anyway\n");
3936         return -1;
3937     }
3938
3939     if(avctx->prediction_method == DWT_97
3940        && (avctx->flags & CODEC_FLAG_QSCALE)
3941        && avctx->global_quality == 0){
3942         av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3943         return -1;
3944     }
3945
3946     common_init(avctx);
3947     alloc_blocks(s);
3948
3949     s->version=0;
3950
3951     s->m.avctx   = avctx;
3952     s->m.flags   = avctx->flags;
3953     s->m.bit_rate= avctx->bit_rate;
3954
3955     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3956     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3957     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3958     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3959     h263_encode_init(&s->m); //mv_penalty
3960
3961     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3962
3963     if(avctx->flags&CODEC_FLAG_PASS1){
3964         if(!avctx->stats_out)
3965             avctx->stats_out = av_mallocz(256);
3966     }
3967     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3968         if(ff_rate_control_init(&s->m) < 0)
3969             return -1;
3970     }
3971     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3972
3973     for(plane_index=0; plane_index<3; plane_index++){
3974         calculate_vissual_weight(s, &s->plane[plane_index]);
3975     }
3976
3977
3978     avctx->coded_frame= &s->current_picture;
3979     switch(avctx->pix_fmt){
3980 //    case PIX_FMT_YUV444P:
3981 //    case PIX_FMT_YUV422P:
3982     case PIX_FMT_YUV420P:
3983     case PIX_FMT_GRAY8:
3984 //    case PIX_FMT_YUV411P:
3985 //    case PIX_FMT_YUV410P:
3986         s->colorspace_type= 0;
3987         break;
3988 /*    case PIX_FMT_RGB32:
3989         s->colorspace= 1;
3990         break;*/
3991     default:
3992         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3993         return -1;
3994     }
3995 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3996     s->chroma_h_shift= 1;
3997     s->chroma_v_shift= 1;
3998
3999     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4000     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4001
4002     s->avctx->get_buffer(s->avctx, &s->input_picture);
4003
4004     if(s->avctx->me_method == ME_ITER){
4005         int i;
4006         int size= s->b_width * s->b_height << 2*s->block_max_depth;
4007         for(i=0; i<s->max_ref_frames; i++){
4008             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4009             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4010         }
4011     }
4012
4013     return 0;
4014 }
4015
4016 static int frame_start(SnowContext *s){
4017    AVFrame tmp;
4018    int w= s->avctx->width; //FIXME round up to x16 ?
4019    int h= s->avctx->height;
4020
4021     if(s->current_picture.data[0]){
4022         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4023         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4024         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4025     }
4026
4027     tmp= s->last_picture[s->max_ref_frames-1];
4028     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4029     s->last_picture[0]= s->current_picture;
4030     s->current_picture= tmp;
4031
4032     if(s->keyframe){
4033         s->ref_frames= 0;
4034     }else{
4035         int i;
4036         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4037             if(i && s->last_picture[i-1].key_frame)
4038                 break;
4039         s->ref_frames= i;
4040     }
4041
4042     s->current_picture.reference= 1;
4043     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4044         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4045         return -1;
4046     }
4047
4048     s->current_picture.key_frame= s->keyframe;
4049
4050     return 0;
4051 }
4052
4053 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4054     SnowContext *s = avctx->priv_data;
4055     RangeCoder * const c= &s->c;
4056     AVFrame *pict = data;
4057     const int width= s->avctx->width;
4058     const int height= s->avctx->height;
4059     int level, orientation, plane_index, i, y;
4060     uint8_t rc_header_bak[sizeof(s->header_state)];
4061     uint8_t rc_block_bak[sizeof(s->block_state)];
4062
4063     ff_init_range_encoder(c, buf, buf_size);
4064     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4065
4066     for(i=0; i<3; i++){
4067         int shift= !!i;
4068         for(y=0; y<(height>>shift); y++)
4069             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4070                    &pict->data[i][y * pict->linesize[i]],
4071                    width>>shift);
4072     }
4073     s->new_picture = *pict;
4074
4075     s->m.picture_number= avctx->frame_number;
4076     if(avctx->flags&CODEC_FLAG_PASS2){
4077         s->m.pict_type =
4078         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4079         s->keyframe= pict->pict_type==FF_I_TYPE;
4080         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
4081             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4082             if (pict->quality < 0)
4083                 return -1;
4084         }
4085     }else{
4086         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4087         s->m.pict_type=
4088         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4089     }
4090
4091     if(s->pass1_rc && avctx->frame_number == 0)
4092         pict->quality= 2*FF_QP2LAMBDA;
4093     if(pict->quality){
4094         s->qlog= qscale2qlog(pict->quality);
4095         s->lambda = pict->quality * 3/2;
4096     }
4097     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
4098         s->qlog= LOSSLESS_QLOG;
4099         s->lambda = 0;
4100     }//else keep previous frame's qlog until after motion est
4101
4102     frame_start(s);
4103
4104     s->m.current_picture_ptr= &s->m.current_picture;
4105     if(pict->pict_type == P_TYPE){
4106         int block_width = (width +15)>>4;
4107         int block_height= (height+15)>>4;
4108         int stride= s->current_picture.linesize[0];
4109
4110         assert(s->current_picture.data[0]);
4111         assert(s->last_picture[0].data[0]);
4112
4113         s->m.avctx= s->avctx;
4114         s->m.current_picture.data[0]= s->current_picture.data[0];
4115         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
4116         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4117         s->m.   last_picture_ptr= &s->m.   last_picture;
4118         s->m.linesize=
4119         s->m.   last_picture.linesize[0]=
4120         s->m.    new_picture.linesize[0]=
4121         s->m.current_picture.linesize[0]= stride;
4122         s->m.uvlinesize= s->current_picture.linesize[1];
4123         s->m.width = width;
4124         s->m.height= height;
4125         s->m.mb_width = block_width;
4126         s->m.mb_height= block_height;
4127         s->m.mb_stride=   s->m.mb_width+1;
4128         s->m.b8_stride= 2*s->m.mb_width+1;
4129         s->m.f_code=1;
4130         s->m.pict_type= pict->pict_type;
4131         s->m.me_method= s->avctx->me_method;
4132         s->m.me.scene_change_score=0;
4133         s->m.flags= s->avctx->flags;
4134         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4135         s->m.out_format= FMT_H263;
4136         s->m.unrestricted_mv= 1;
4137
4138         s->m.lambda = s->lambda;
4139         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4140         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4141
4142         s->m.dsp= s->dsp; //move
4143         ff_init_me(&s->m);
4144         s->dsp= s->m.dsp;
4145     }
4146
4147     if(s->pass1_rc){
4148         memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
4149         memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
4150     }
4151
4152 redo_frame:
4153
4154     s->m.pict_type = pict->pict_type;
4155     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
4156
4157     encode_header(s);
4158     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4159     encode_blocks(s, 1);
4160     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4161
4162     for(plane_index=0; plane_index<3; plane_index++){
4163         Plane *p= &s->plane[plane_index];
4164         int w= p->width;
4165         int h= p->height;
4166         int x, y;
4167 //        int bits= put_bits_count(&s->c.pb);
4168
4169     if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
4170         //FIXME optimize
4171      if(pict->data[plane_index]) //FIXME gray hack
4172         for(y=0; y<h; y++){
4173             for(x=0; x<w; x++){
4174                 s->spatial_dwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4175             }
4176         }
4177         predict_plane(s, s->spatial_dwt_buffer, plane_index, 0);
4178
4179         if(   plane_index==0
4180            && pict->pict_type == P_TYPE
4181            && !(avctx->flags&CODEC_FLAG_PASS2)
4182            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4183             ff_init_range_encoder(c, buf, buf_size);
4184             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4185             pict->pict_type= FF_I_TYPE;
4186             s->keyframe=1;
4187             s->current_picture.key_frame=1;
4188             goto redo_frame;
4189         }
4190
4191         if(s->qlog == LOSSLESS_QLOG){
4192             for(y=0; y<h; y++){
4193                 for(x=0; x<w; x++){
4194                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_dwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4195                 }
4196             }
4197         }
4198
4199         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4200
4201         if(s->pass1_rc && plane_index==0){
4202             int delta_qlog = ratecontrol_1pass(s, pict);
4203             if (delta_qlog <= INT_MIN)
4204                 return -1;
4205             if(delta_qlog){
4206                 //reordering qlog in the bitstream would eliminate this reset
4207                 ff_init_range_encoder(c, buf, buf_size);
4208                 memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
4209                 memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
4210                 encode_header(s);
4211                 encode_blocks(s, 0);
4212             }
4213         }
4214
4215         for(level=0; level<s->spatial_decomposition_count; level++){
4216             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4217                 SubBand *b= &p->band[level][orientation];
4218
4219                 quantize(s, b, b->buf, b->stride, s->qbias);
4220                 if(orientation==0)
4221                     decorrelate(s, b, b->buf, b->stride, pict->pict_type == P_TYPE, 0);
4222                 encode_subband(s, b, b->buf, b->parent ? b->parent->buf : NULL, b->stride, orientation);
4223                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
4224                 if(orientation==0)
4225                     correlate(s, b, b->buf, b->stride, 1, 0);
4226             }
4227         }
4228 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
4229
4230         for(level=0; level<s->spatial_decomposition_count; level++){
4231             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4232                 SubBand *b= &p->band[level][orientation];
4233
4234                 dequantize(s, b, b->buf, b->stride);
4235             }
4236         }
4237
4238         ff_spatial_idwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4239         if(s->qlog == LOSSLESS_QLOG){
4240             for(y=0; y<h; y++){
4241                 for(x=0; x<w; x++){
4242                     s->spatial_dwt_buffer[y*w + x]<<=FRAC_BITS;
4243                 }
4244             }
4245         }
4246 {START_TIMER
4247         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4248 STOP_TIMER("pred-conv")}
4249       }else{
4250             //ME/MC only
4251             if(pict->pict_type == I_TYPE){
4252                 for(y=0; y<h; y++){
4253                     for(x=0; x<w; x++){
4254                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
4255                             pict->data[plane_index][y*pict->linesize[plane_index] + x];
4256                     }
4257                 }
4258             }else{
4259                 memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4260                 predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4261             }
4262       }
4263         if(s->avctx->flags&CODEC_FLAG_PSNR){
4264             int64_t error= 0;
4265
4266     if(pict->data[plane_index]) //FIXME gray hack
4267             for(y=0; y<h; y++){
4268                 for(x=0; x<w; x++){
4269                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4270                     error += d*d;
4271                 }
4272             }
4273             s->avctx->error[plane_index] += error;
4274             s->current_picture.error[plane_index] = error;
4275         }
4276     }
4277
4278     if(s->last_picture[s->max_ref_frames-1].data[0])
4279         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4280
4281     s->current_picture.coded_picture_number = avctx->frame_number;
4282     s->current_picture.pict_type = pict->pict_type;
4283     s->current_picture.quality = pict->quality;
4284     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4285     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4286     s->m.current_picture.display_picture_number =
4287     s->m.current_picture.coded_picture_number = avctx->frame_number;
4288     s->m.current_picture.quality = pict->quality;
4289     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4290     if(s->pass1_rc)
4291         if (ff_rate_estimate_qscale(&s->m, 0) < 0)
4292             return -1;
4293     if(avctx->flags&CODEC_FLAG_PASS1)
4294         ff_write_pass1_stats(&s->m);
4295     s->m.last_pict_type = s->m.pict_type;
4296     avctx->frame_bits = s->m.frame_bits;
4297     avctx->mv_bits = s->m.mv_bits;
4298     avctx->misc_bits = s->m.misc_bits;
4299     avctx->p_tex_bits = s->m.p_tex_bits;
4300
4301     emms_c();
4302
4303     return ff_rac_terminate(c);
4304 }
4305
4306 static void common_end(SnowContext *s){
4307     int plane_index, level, orientation, i;
4308
4309     av_freep(&s->spatial_dwt_buffer);
4310
4311     av_freep(&s->m.me.scratchpad);
4312     av_freep(&s->m.me.map);
4313     av_freep(&s->m.me.score_map);
4314     av_freep(&s->m.obmc_scratchpad);
4315
4316     av_freep(&s->block);
4317
4318     for(i=0; i<MAX_REF_FRAMES; i++){
4319         av_freep(&s->ref_mvs[i]);
4320         av_freep(&s->ref_scores[i]);
4321         if(s->last_picture[i].data[0])
4322             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4323     }
4324
4325     for(plane_index=0; plane_index<3; plane_index++){
4326         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4327             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4328                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4329
4330                 av_freep(&b->x_coeff);
4331             }
4332         }
4333     }
4334 }
4335
4336 static int encode_end(AVCodecContext *avctx)
4337 {
4338     SnowContext *s = avctx->priv_data;
4339
4340     common_end(s);
4341     av_free(avctx->stats_out);
4342
4343     return 0;
4344 }
4345
4346 static int decode_init(AVCodecContext *avctx)
4347 {
4348     SnowContext *s = avctx->priv_data;
4349     int block_size;
4350
4351     avctx->pix_fmt= PIX_FMT_YUV420P;
4352
4353     common_init(avctx);
4354
4355     block_size = MB_SIZE >> s->block_max_depth;
4356     slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 3)) + 1, s->plane[0].width, s->spatial_dwt_buffer);
4357
4358     return 0;
4359 }
4360
4361 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4362     SnowContext *s = avctx->priv_data;
4363     RangeCoder * const c= &s->c;
4364     int bytes_read;
4365     AVFrame *picture = data;
4366     int level, orientation, plane_index;
4367
4368     ff_init_range_decoder(c, buf, buf_size);
4369     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4370
4371     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4372     decode_header(s);
4373     if(!s->block) alloc_blocks(s);
4374
4375     frame_start(s);
4376     //keyframe flag dupliaction mess FIXME
4377     if(avctx->debug&FF_DEBUG_PICT_INFO)
4378         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4379
4380     decode_blocks(s);
4381
4382     for(plane_index=0; plane_index<3; plane_index++){
4383         Plane *p= &s->plane[plane_index];
4384         int w= p->width;
4385         int h= p->height;
4386         int x, y;
4387         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4388
4389 if(s->avctx->debug&2048){
4390         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4391         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4392
4393         for(y=0; y<h; y++){
4394             for(x=0; x<w; x++){
4395                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4396                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4397             }
4398         }
4399 }
4400
4401 {   START_TIMER
4402     for(level=0; level<s->spatial_decomposition_count; level++){
4403         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4404             SubBand *b= &p->band[level][orientation];
4405             unpack_coeffs(s, b, b->parent, orientation);
4406         }
4407     }
4408     STOP_TIMER("unpack coeffs");
4409 }
4410
4411 {START_TIMER
4412     const int mb_h= s->b_height << s->block_max_depth;
4413     const int block_size = MB_SIZE >> s->block_max_depth;
4414     const int block_w    = plane_index ? block_size/2 : block_size;
4415     int mb_y;
4416     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4417     int yd=0, yq=0;
4418     int y;
4419     int end_y;
4420
4421     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4422     for(mb_y=0; mb_y<=mb_h; mb_y++){
4423
4424         int slice_starty = block_w*mb_y;
4425         int slice_h = block_w*(mb_y+1);
4426         if (!(s->keyframe || s->avctx->debug&512)){
4427             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4428             slice_h -= (block_w >> 1);
4429         }
4430
4431         {
4432         START_TIMER
4433         for(level=0; level<s->spatial_decomposition_count; level++){
4434             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4435                 SubBand *b= &p->band[level][orientation];
4436                 int start_y;
4437                 int end_y;
4438                 int our_mb_start = mb_y;
4439                 int our_mb_end = (mb_y + 1);
4440                 const int extra= 3;
4441                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4442                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4443                 if (!(s->keyframe || s->avctx->debug&512)){
4444                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4445                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4446                 }
4447                 start_y = FFMIN(b->height, start_y);
4448                 end_y = FFMIN(b->height, end_y);
4449
4450                 if (start_y != end_y){
4451                     if (orientation == 0){
4452                         SubBand * correlate_band = &p->band[0][0];
4453                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4454                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4455                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4456                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4457                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, start_y, end_y);
4458                     }
4459                     else
4460                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4461                 }
4462             }
4463         }
4464         STOP_TIMER("decode_subband_slice");
4465         }
4466
4467 {   START_TIMER
4468         for(; yd<slice_h; yd+=4){
4469             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4470         }
4471     STOP_TIMER("idwt slice");}
4472
4473
4474         if(s->qlog == LOSSLESS_QLOG){
4475             for(; yq<slice_h && yq<h; yq++){
4476                 DWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4477                 for(x=0; x<w; x++){
4478                     line[x] <<= FRAC_BITS;
4479                 }
4480             }
4481         }
4482
4483         predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
4484
4485         y = FFMIN(p->height, slice_starty);
4486         end_y = FFMIN(p->height, slice_h);
4487         while(y < end_y)
4488             slice_buffer_release(&s->sb, y++);
4489     }
4490
4491     slice_buffer_flush(&s->sb);
4492
4493 STOP_TIMER("idwt + predict_slices")}
4494     }
4495
4496     emms_c();
4497
4498     if(s->last_picture[s->max_ref_frames-1].data[0])
4499         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4500
4501 if(!(s->avctx->debug&2048))
4502     *picture= s->current_picture;
4503 else
4504     *picture= s->mconly_picture;
4505
4506     *data_size = sizeof(AVFrame);
4507
4508     bytes_read= c->bytestream - c->bytestream_start;
4509     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4510
4511     return bytes_read;
4512 }
4513
4514 static int decode_end(AVCodecContext *avctx)
4515 {
4516     SnowContext *s = avctx->priv_data;
4517
4518     slice_buffer_destroy(&s->sb);
4519
4520     common_end(s);
4521
4522     return 0;
4523 }
4524
4525 AVCodec snow_decoder = {
4526     "snow",
4527     CODEC_TYPE_VIDEO,
4528     CODEC_ID_SNOW,
4529     sizeof(SnowContext),
4530     decode_init,
4531     NULL,
4532     decode_end,
4533     decode_frame,
4534     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4535     NULL
4536 };
4537
4538 #ifdef CONFIG_SNOW_ENCODER
4539 AVCodec snow_encoder = {
4540     "snow",
4541     CODEC_TYPE_VIDEO,
4542     CODEC_ID_SNOW,
4543     sizeof(SnowContext),
4544     encode_init,
4545     encode_frame,
4546     encode_end,
4547 };
4548 #endif
4549
4550
4551 #if 0
4552 #undef malloc
4553 #undef free
4554 #undef printf
4555 #undef random
4556
4557 int main(){
4558     int width=256;
4559     int height=256;
4560     int buffer[2][width*height];
4561     SnowContext s;
4562     int i;
4563     s.spatial_decomposition_count=6;
4564     s.spatial_decomposition_type=1;
4565
4566     printf("testing 5/3 DWT\n");
4567     for(i=0; i<width*height; i++)
4568         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4569
4570     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4571     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4572
4573     for(i=0; i<width*height; i++)
4574         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4575
4576     printf("testing 9/7 DWT\n");
4577     s.spatial_decomposition_type=0;
4578     for(i=0; i<width*height; i++)
4579         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4580
4581     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4582     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4583
4584     for(i=0; i<width*height; i++)
4585         if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4586
4587 #if 0
4588     printf("testing AC coder\n");
4589     memset(s.header_state, 0, sizeof(s.header_state));
4590     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4591     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4592
4593     for(i=-256; i<256; i++){
4594 START_TIMER
4595         put_symbol(&s.c, s.header_state, i*i*i/3*FFABS(i), 1);
4596 STOP_TIMER("put_symbol")
4597     }
4598     ff_rac_terminate(&s.c);
4599
4600     memset(s.header_state, 0, sizeof(s.header_state));
4601     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4602     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4603
4604     for(i=-256; i<256; i++){
4605         int j;
4606 START_TIMER
4607         j= get_symbol(&s.c, s.header_state, 1);
4608 STOP_TIMER("get_symbol")
4609         if(j!=i*i*i/3*FFABS(i)) printf("fsck: %d != %d\n", i, j);
4610     }
4611 #endif
4612 {
4613 int level, orientation, x, y;
4614 int64_t errors[8][4];
4615 int64_t g=0;
4616
4617     memset(errors, 0, sizeof(errors));
4618     s.spatial_decomposition_count=3;
4619     s.spatial_decomposition_type=0;
4620     for(level=0; level<s.spatial_decomposition_count; level++){
4621         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4622             int w= width  >> (s.spatial_decomposition_count-level);
4623             int h= height >> (s.spatial_decomposition_count-level);
4624             int stride= width  << (s.spatial_decomposition_count-level);
4625             DWTELEM *buf= buffer[0];
4626             int64_t error=0;
4627
4628             if(orientation&1) buf+=w;
4629             if(orientation>1) buf+=stride>>1;
4630
4631             memset(buffer[0], 0, sizeof(int)*width*height);
4632             buf[w/2 + h/2*stride]= 256*256;
4633             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4634             for(y=0; y<height; y++){
4635                 for(x=0; x<width; x++){
4636                     int64_t d= buffer[0][x + y*width];
4637                     error += d*d;
4638                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
4639                 }
4640                 if(FFABS(height/2-y)<9 && level==2) printf("\n");
4641             }
4642             error= (int)(sqrt(error)+0.5);
4643             errors[level][orientation]= error;
4644             if(g) g=ff_gcd(g, error);
4645             else g= error;
4646         }
4647     }
4648     printf("static int const visual_weight[][4]={\n");
4649     for(level=0; level<s.spatial_decomposition_count; level++){
4650         printf("  {");
4651         for(orientation=0; orientation<4; orientation++){
4652             printf("%8"PRId64",", errors[level][orientation]/g);
4653         }
4654         printf("},\n");
4655     }
4656     printf("};\n");
4657     {
4658             int level=2;
4659             int orientation=3;
4660             int w= width  >> (s.spatial_decomposition_count-level);
4661             int h= height >> (s.spatial_decomposition_count-level);
4662             int stride= width  << (s.spatial_decomposition_count-level);
4663             DWTELEM *buf= buffer[0];
4664             int64_t error=0;
4665
4666             buf+=w;
4667             buf+=stride>>1;
4668
4669             memset(buffer[0], 0, sizeof(int)*width*height);
4670 #if 1
4671             for(y=0; y<height; y++){
4672                 for(x=0; x<width; x++){
4673                     int tab[4]={0,2,3,1};
4674                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4675                 }
4676             }
4677             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4678 #else
4679             for(y=0; y<h; y++){
4680                 for(x=0; x<w; x++){
4681                     buf[x + y*stride  ]=169;
4682                     buf[x + y*stride-w]=64;
4683                 }
4684             }
4685             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4686 #endif
4687             for(y=0; y<height; y++){
4688                 for(x=0; x<width; x++){
4689                     int64_t d= buffer[0][x + y*width];
4690                     error += d*d;
4691                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
4692                 }
4693                 if(FFABS(height/2-y)<9) printf("\n");
4694             }
4695     }
4696
4697 }
4698     return 0;
4699 }
4700 #endif
4701