third_party/dav1d/src/loopfilter_tmpl.c

   1 /*
   2  * Copyright © 2018, VideoLAN and dav1d authors
   3  * Copyright © 2018, Two Orioles, LLC
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright notice, this
  10  *    list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright notice,
  13  *    this list of conditions and the following disclaimer in the documentation
  14  *    and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
  20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29
  30 #include <stdlib.h>
  31
  32 #include "common/attributes.h"
  33 #include "common/intops.h"
  34
  35 #include "src/loopfilter.h"
  36
  37 static NOINLINE void
  38 loop_filter(pixel *dst, int E, int I, int H,
  39             const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd
  40             HIGHBD_DECL_SUFFIX)
  41 {
  42     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
  43     const int F = 1 << bitdepth_min_8;
  44     E <<= bitdepth_min_8;
  45     I <<= bitdepth_min_8;
  46     H <<= bitdepth_min_8;
  47
  48     for (int i = 0; i < 4; i++, dst += stridea) {
  49         int p6, p5, p4, p3, p2;
  50         int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
  51         int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
  52         int q2, q3, q4, q5, q6;
  53         int fm, flat8out, flat8in;
  54
  55         fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
  56              abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
  57
  58         if (wd > 4) {
  59             p2 = dst[strideb * -3];
  60             q2 = dst[strideb * +2];
  61
  62             fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
  63
  64             if (wd > 6) {
  65                 p3 = dst[strideb * -4];
  66                 q3 = dst[strideb * +3];
  67
  68                 fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
  69             }
  70         }
  71         if (!fm) continue;
  72
  73         if (wd >= 16) {
  74             p6 = dst[strideb * -7];
  75             p5 = dst[strideb * -6];
  76             p4 = dst[strideb * -5];
  77             q4 = dst[strideb * +4];
  78             q5 = dst[strideb * +5];
  79             q6 = dst[strideb * +6];
  80
  81             flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
  82                        abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
  83                        abs(q5 - q0) <= F && abs(q6 - q0) <= F;
  84         }
  85
  86         if (wd >= 6)
  87             flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
  88                       abs(q1 - q0) <= F && abs(q2 - q0) <= F;
  89
  90         if (wd >= 8)
  91             flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
  92
  93         if (wd >= 16 && (flat8out & flat8in)) {
  94             dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
  95                                  p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
  96             dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
  97                                  p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
  98             dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
  99                                  p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
 100             dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
 101                                  p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
 102             dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
 103                                  p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
 104             dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
 105                                  q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
 106             dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
 107                                  q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
 108             dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
 109                                  q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
 110             dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
 111                                  q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
 112             dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
 113                                  q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
 114             dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
 115                                  q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
 116             dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
 117                                  q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
 118         } else if (wd >= 8 && flat8in) {
 119             dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
 120             dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
 121             dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
 122             dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
 123             dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
 124             dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
 125         } else if (wd == 6 && flat8in) {
 126             dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
 127             dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
 128             dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
 129             dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
 130         } else {
 131             const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
 132
 133 #define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
 134                                 128 * (1 << bitdepth_min_8) - 1)
 135
 136             if (hev) {
 137                 int f = iclip_diff(p1 - q1), f1, f2;
 138                 f = iclip_diff(3 * (q0 - p0) + f);
 139
 140                 f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
 141                 f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
 142
 143                 dst[strideb * -1] = iclip_pixel(p0 + f2);
 144                 dst[strideb * +0] = iclip_pixel(q0 - f1);
 145             } else {
 146                 int f = iclip_diff(3 * (q0 - p0)), f1, f2;
 147
 148                 f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
 149                 f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
 150
 151                 dst[strideb * -1] = iclip_pixel(p0 + f2);
 152                 dst[strideb * +0] = iclip_pixel(q0 - f1);
 153
 154                 f = (f1 + 1) >> 1;
 155                 dst[strideb * -2] = iclip_pixel(p1 + f);
 156                 dst[strideb * +1] = iclip_pixel(q1 - f);
 157             }
 158 #undef iclip_diff
 159         }
 160     }
 161 }
 162
 163 static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
 164                                    const uint32_t *const vmask,
 165                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
 166                                    const Av1FilterLUT *lut, const int h
 167                                    HIGHBD_DECL_SUFFIX)
 168 {
 169     const unsigned vm = vmask[0] | vmask[1] | vmask[2];
 170     for (unsigned y = 1; vm & ~(y - 1);
 171          y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
 172     {
 173         if (vm & y) {
 174             const int L = l[0][0] ? l[0][0] : l[-1][0];
 175             if (!L) continue;
 176             const int H = L >> 4;
 177             const int E = lut->e[L], I = lut->i[L];
 178             const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
 179             loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx
 180                         HIGHBD_TAIL_SUFFIX);
 181         }
 182     }
 183 }
 184
 185 static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
 186                                    const uint32_t *const vmask,
 187                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
 188                                    const Av1FilterLUT *lut, const int w
 189                                    HIGHBD_DECL_SUFFIX)
 190 {
 191     const unsigned vm = vmask[0] | vmask[1] | vmask[2];
 192     for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
 193         if (vm & x) {
 194             const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
 195             if (!L) continue;
 196             const int H = L >> 4;
 197             const int E = lut->e[L], I = lut->i[L];
 198             const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
 199             loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx
 200                         HIGHBD_TAIL_SUFFIX);
 201         }
 202     }
 203 }
 204
 205 static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
 206                                     const uint32_t *const vmask,
 207                                     const uint8_t (*l)[4], ptrdiff_t b4_stride,
 208                                     const Av1FilterLUT *lut, const int h
 209                                     HIGHBD_DECL_SUFFIX)
 210 {
 211     const unsigned vm = vmask[0] | vmask[1];
 212     for (unsigned y = 1; vm & ~(y - 1);
 213          y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
 214     {
 215         if (vm & y) {
 216             const int L = l[0][0] ? l[0][0] : l[-1][0];
 217             if (!L) continue;
 218             const int H = L >> 4;
 219             const int E = lut->e[L], I = lut->i[L];
 220             const int idx = !!(vmask[1] & y);
 221             loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx
 222                         HIGHBD_TAIL_SUFFIX);
 223         }
 224     }
 225 }
 226
 227 static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
 228                                     const uint32_t *const vmask,
 229                                     const uint8_t (*l)[4], ptrdiff_t b4_stride,
 230                                     const Av1FilterLUT *lut, const int w
 231                                     HIGHBD_DECL_SUFFIX)
 232 {
 233     const unsigned vm = vmask[0] | vmask[1];
 234     for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
 235         if (vm & x) {
 236             const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
 237             if (!L) continue;
 238             const int H = L >> 4;
 239             const int E = lut->e[L], I = lut->i[L];
 240             const int idx = !!(vmask[1] & x);
 241             loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx
 242                         HIGHBD_TAIL_SUFFIX);
 243         }
 244     }
 245 }
 246
 247 #if HAVE_ASM
 248 #if ARCH_AARCH64 || ARCH_ARM
 249 #include "src/arm/loopfilter.h"
 250 #elif ARCH_LOONGARCH64
 251 #include "src/loongarch/loopfilter.h"
 252 #elif ARCH_X86
 253 #include "src/x86/loopfilter.h"
 254 #endif
 255 #endif
 256
 257 COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
 258     c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
 259     c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
 260     c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
 261     c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
 262
 263 #if HAVE_ASM
 264 #if ARCH_AARCH64 || ARCH_ARM
 265     loop_filter_dsp_init_arm(c);
 266 #elif ARCH_LOONGARCH64
 267     loop_filter_dsp_init_loongarch(c);
 268 #elif ARCH_X86
 269     loop_filter_dsp_init_x86(c);
 270 #endif
 271 #endif
 272 }