libsanitizer/sanitizer_common/sanitizer_lzw.h

   1 //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // Lempel–Ziv–Welch encoding/decoding
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef SANITIZER_LZW_H
  14 #define SANITIZER_LZW_H
  15
  16 #include "sanitizer_dense_map.h"
  17
  18 namespace __sanitizer {
  19
   20 using LzwCodeType = u32;  // Code type emitted by LzwEncode, read by LzwDecode.
  21
  22 template <class T, class ItIn, class ItOut>
  23 ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
  24   using Substring =
  25       detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
  26
  27   // Sentinel value for substrings of len 1.
  28   static constexpr LzwCodeType kNoPrefix =
  29       Min(DenseMapInfo<Substring>::getEmptyKey().first,
  30           DenseMapInfo<Substring>::getTombstoneKey().first) -
  31       1;
  32   DenseMap<Substring, LzwCodeType> prefix_to_code;
  33   {
  34     // Add all substring of len 1 as initial dictionary.
  35     InternalMmapVector<T> dict_len1;
  36     for (auto it = begin; it != end; ++it)
  37       if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
  38         dict_len1.push_back(*it);
  39
  40     // Slightly helps with later delta encoding.
  41     Sort(dict_len1.data(), dict_len1.size());
  42
  43     // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
  44     // just generate them.
  45     *out = dict_len1.size();
  46     ++out;
  47
  48     for (uptr i = 0; i != dict_len1.size(); ++i) {
  49       // Remap after the Sort.
  50       prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
  51       *out = dict_len1[i];
  52       ++out;
  53     }
  54     CHECK_EQ(prefix_to_code.size(), dict_len1.size());
  55   }
  56
  57   if (begin == end)
  58     return out;
  59
  60   // Main LZW encoding loop.
  61   LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
  62   ++begin;
  63   for (auto it = begin; it != end; ++it) {
  64     // Extend match with the new item.
  65     auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
  66     if (ins.second) {
  67       // This is a new substring, but emit the code for the current match
  68       // (before extend). This allows LZW decoder to recover the dictionary.
  69       *out = match;
  70       ++out;
  71       // Reset the match to a single item, which must be already in the map.
  72       match = prefix_to_code.find({kNoPrefix, *it})->second;
  73     } else {
  74       // Already known, use as the current match.
  75       match = ins.first->second;
  76     }
  77   }
  78
  79   *out = match;
  80   ++out;
  81
  82   return out;
  83 }
  84
  85 template <class T, class ItIn, class ItOut>
  86 ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
  87   if (begin == end)
  88     return out;
  89
  90   // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
  91   InternalMmapVector<T> dict_len1(*begin);
  92   ++begin;
  93
  94   if (begin == end)
  95     return out;
  96
  97   for (auto& v : dict_len1) {
  98     v = *begin;
  99     ++begin;
 100   }
 101
 102   // Substrings of len 2 and up. Indexes are shifted because [0,
 103   // dict_len1.size()) stored in dict_len1. Substings get here after being
 104   // emitted to the output, so we can use output position.
 105   InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
 106       code_to_substr;
 107
 108   // Copies already emitted substrings into the output again.
 109   auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
 110     if (code < dict_len1.size()) {
 111       *out = dict_len1[code];
 112       ++out;
 113       return out;
 114     }
 115     const auto& s = code_to_substr[code - dict_len1.size()];
 116
 117     for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
 118     return out;
 119   };
 120
 121   // Returns lens of the substring with the given code.
 122   auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
 123     if (code < dict_len1.size())
 124       return 1;
 125     const auto& s = code_to_substr[code - dict_len1.size()];
 126     return s.second - s.first;
 127   };
 128
 129   // Main LZW decoding loop.
 130   LzwCodeType prev_code = *begin;
 131   ++begin;
 132   out = copy(prev_code, out);
 133   for (auto it = begin; it != end; ++it) {
 134     LzwCodeType code = *it;
 135     auto start = out;
 136     if (code == dict_len1.size() + code_to_substr.size()) {
 137       // Special LZW case. The code is not in the dictionary yet. This is
 138       // possible only when the new substring is the same as previous one plus
 139       // the first item of the previous substring. We can emit that in two
 140       // steps.
 141       out = copy(prev_code, out);
 142       *out = *start;
 143       ++out;
 144     } else {
 145       out = copy(code, out);
 146     }
 147
 148     // Every time encoded emits the code, it also creates substing of len + 1
 149     // including the first item of the just emmited substring. Do the same here.
 150     uptr len = code_to_len(prev_code);
 151     code_to_substr.push_back({start - len, start + 1});
 152
 153     prev_code = code;
 154   }
 155   return out;
 156 }
 157
 158 }  // namespace __sanitizer
 159 #endif