gfx/thebes/gfxAlphaRecoveryGeneric.h

   1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
   2  * This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5 #ifndef _GFXALPHARECOVERY_GENERIC_H_
   6 #define _GFXALPHARECOVERY_GENERIC_H_
   7
   8 #include "gfxAlphaRecovery.h"
   9 #include "gfxImageSurface.h"
  10 #include "nsDebug.h"
  11 #include <xsimd/xsimd.hpp>
  12
  13 template <typename Arch>
  14 bool gfxAlphaRecovery::RecoverAlphaGeneric(gfxImageSurface* blackSurf,
  15                                            const gfxImageSurface* whiteSurf) {
  16   mozilla::gfx::IntSize size = blackSurf->GetSize();
  17
  18   if (size != whiteSurf->GetSize() ||
  19       (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  20        blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
  21       (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  22        whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
  23     return false;
  24
  25   blackSurf->Flush();
  26   whiteSurf->Flush();
  27
  28   unsigned char* blackData = blackSurf->Data();
  29   unsigned char* whiteData = whiteSurf->Data();
  30
  31   if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
  32           (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
  33       (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
  34     // Cannot keep these in alignment.
  35     return false;
  36   }
  37
  38   alignas(Arch::alignment()) static const uint8_t greenMaski[] = {
  39       0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
  40       0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
  41   };
  42   alignas(Arch::alignment()) static const uint8_t alphaMaski[] = {
  43       0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
  44       0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
  45   };
  46
  47   using batch_type = xsimd::batch<uint8_t, Arch>;
  48   constexpr size_t batch_size = batch_type::size;
  49   static_assert(batch_size == 16);
  50
  51   batch_type greenMask = batch_type::load_aligned(greenMaski);
  52   batch_type alphaMask = batch_type::load_aligned(alphaMaski);
  53
  54   for (int32_t i = 0; i < size.height; ++i) {
  55     int32_t j = 0;
  56     // Loop single pixels until at 4 byte alignment.
  57     while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
  58       *((uint32_t*)blackData) =
  59           RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
  60                        *reinterpret_cast<uint32_t*>(whiteData));
  61       blackData += 4;
  62       whiteData += 4;
  63       j++;
  64     }
  65     // This extra loop allows the compiler to do some more clever registry
  66     // management and makes it about 5% faster than with only the 4 pixel
  67     // at a time loop.
  68     for (; j < size.width - 8; j += 8) {
  69       auto black1 = batch_type::load_aligned(blackData);
  70       auto white1 = batch_type::load_aligned(whiteData);
  71       auto black2 = batch_type::load_aligned(blackData + batch_size);
  72       auto white2 = batch_type::load_aligned(whiteData + batch_size);
  73
  74       // Execute the same instructions as described in RecoverPixel, only
  75       // using an SSE2 packed saturated subtract.
  76       white1 = xsimd::ssub(white1, black1);
  77       white2 = xsimd::ssub(white2, black2);
  78       white1 = xsimd::ssub(greenMask, white1);
  79       white2 = xsimd::ssub(greenMask, white2);
  80       // Producing the final black pixel in an XMM register and storing
  81       // that is actually faster than doing a masked store since that
  82       // does an unaligned storage. We have the black pixel in a register
  83       // anyway.
  84       black1 = xsimd::bitwise_andnot(black1, alphaMask);
  85       black2 = xsimd::bitwise_andnot(black2, alphaMask);
  86       white1 = xsimd::slide_left<2>(white1);
  87       white2 = xsimd::slide_left<2>(white2);
  88       white1 &= alphaMask;
  89       white2 &= alphaMask;
  90       black1 |= white1;
  91       black2 |= white2;
  92
  93       black1.store_aligned(blackData);
  94       black2.store_aligned(blackData + batch_size);
  95       blackData += 2 * batch_size;
  96       whiteData += 2 * batch_size;
  97     }
  98     for (; j < size.width - 4; j += 4) {
  99       auto black = batch_type::load_aligned(blackData);
 100       auto white = batch_type::load_aligned(whiteData);
 101
 102       white = xsimd::ssub(white, black);
 103       white = xsimd::ssub(greenMask, white);
 104       black = xsimd::bitwise_andnot(black, alphaMask);
 105       white = xsimd::slide_left<2>(white);
 106       white &= alphaMask;
 107       black |= white;
 108       black.store_aligned(blackData);
 109       blackData += batch_size;
 110       whiteData += batch_size;
 111     }
 112     // Loop single pixels until we're done.
 113     while (j < size.width) {
 114       *((uint32_t*)blackData) =
 115           RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
 116                        *reinterpret_cast<uint32_t*>(whiteData));
 117       blackData += 4;
 118       whiteData += 4;
 119       j++;
 120     }
 121     blackData += blackSurf->Stride() - j * 4;
 122     whiteData += whiteSurf->Stride() - j * 4;
 123   }
 124
 125   blackSurf->MarkDirty();
 126
 127   return true;
 128 }
 129 #endif