chrome/browser/safe_browsing/safe_browsing_store_file.h

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
   6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
   7
   8 #include <set>
   9 #include <vector>
  10
  11 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
  12
  13 #include "base/callback.h"
  14 #include "base/files/file_path.h"
  15 #include "base/files/scoped_file.h"
  16
  17 // Implement SafeBrowsingStore in terms of a flat file.  The file
  18 // format is pretty literal:
  19 //
  20 // int32 magic;             // magic number "validating" file
  21 // int32 version;           // format version
  22 //
  23 // // Counts for the various data which follows the header.
  24 // uint32 add_chunk_count;  // Chunks seen, including empties.
  25 // uint32 sub_chunk_count;  // Ditto.
  26 // uint32 shard_stride;     // SBPrefix space covered per shard.
  27 //                          // 0==entire space in one shard.
  28 // // Sorted by chunk_id.
  29 // array[add_chunk_count] {
  30 //   int32 chunk_id;
  31 // }
  32 // // Sorted by chunk_id.
  33 // array[sub_chunk_count] {
  34 //   int32 chunk_id;
  35 // }
   36 // MD5Digest header_checksum;  // Checksum over preceding data.
  37 //
  38 // // Sorted by prefix, then add chunk_id, then hash, both within shards and
  39 // // overall.
  40 // array[from 0 to wraparound to 0 by shard_stride] {
  41 //   uint32 add_prefix_count;
  42 //   uint32 sub_prefix_count;
  43 //   uint32 add_hash_count;
  44 //   uint32 sub_hash_count;
  45 //   array[add_prefix_count] {
  46 //     int32 chunk_id;
  47 //     uint32 prefix;
  48 //   }
  49 //   array[sub_prefix_count] {
  50 //     int32 chunk_id;
  51 //     int32 add_chunk_id;
  52 //     uint32 add_prefix;
  53 //   }
  54 //   array[add_hash_count] {
  55 //     int32 chunk_id;
  56 //     int32 received_time;     // From base::Time::ToTimeT().
  57 //     char[32] full_hash;
  58 //   }
  59 //   array[sub_hash_count] {
  60 //     int32 chunk_id;
  61 //     int32 add_chunk_id;
  62 //     char[32] add_full_hash;
  63 //   }
  64 // }
  65 // MD5Digest checksum;      // Checksum over entire file.
  66 //
  67 // The checksums are used to allow writing the file without doing an expensive
  68 // fsync().  Since the data can be re-fetched, failing the checksum is not
  69 // catastrophic.  Histograms indicate that file corruption here is pretty
  70 // uncommon.
  71 //
  72 // The |header_checksum| is present to guarantee valid header and chunk data for
  73 // updates.  Only that part of the file needs to be read to post the update.
  74 //
  75 // |shard_stride| breaks the file into approximately-equal portions, allowing
  76 // updates to stream from one file to another with modest memory usage.  It is
  77 // dynamic to adjust to different file sizes without adding excessive overhead.
  78 //
  79 // During the course of an update, uncommitted data is stored in a
  80 // temporary file (which is later re-used to commit).  This is an
  81 // array of chunks, with the count kept in memory until the end of the
  82 // transaction.  The format of this file is like the main file, with
  83 // the list of chunks seen omitted, as that data is tracked in-memory:
  84 //
  85 // array[] {
  86 //   uint32 add_prefix_count;
  87 //   uint32 sub_prefix_count;
  88 //   uint32 add_hash_count;
  89 //   uint32 sub_hash_count;
  90 //   array[add_prefix_count] {
  91 //     int32 chunk_id;
  92 //     uint32 prefix;
  93 //   }
  94 //   array[sub_prefix_count] {
  95 //     int32 chunk_id;
  96 //     int32 add_chunk_id;
  97 //     uint32 add_prefix;
  98 //   }
  99 //   array[add_hash_count] {
 100 //     int32 chunk_id;
 101 //     int32 received_time;     // From base::Time::ToTimeT().
 102 //     char[32] full_hash;
 103 //   }
 104 //   array[sub_hash_count] {
 105 //     int32 chunk_id;
 106 //     int32 add_chunk_id;
 107 //     char[32] add_full_hash;
 108 //   }
 109 // }
 110 //
 111 // The overall transaction works like this:
 112 // - Open the original file to get the chunks-seen data.
 113 // - Open a temp file for storing new chunk info.
 114 // - Write new chunks to the temp file.
 115 // - When the transaction is finished:
 116 //   - Read the update data from the temp file into memory.
 117 //   - Overwrite the temp file with new header data.
 118 //   - Until done:
 119 //     - Read shards of the original file's data into memory.
 120 //     - Merge from the update data.
 121 //     - Write shards to the temp file.
 122 //   - Delete original file.
 123 //   - Rename temp file to original filename.
 124
  125 class SafeBrowsingStoreFile : public SafeBrowsingStore {
  126  public:
  127   SafeBrowsingStoreFile();
  128   virtual ~SafeBrowsingStoreFile();
  129
        // NOTE(review): the definition is not in this header.  Presumably this
        // records |filename| in |filename_| and |corruption_callback| in
        // |corruption_callback_| (the closure OnCorruptDatabase() is documented
        // as calling) -- confirm against the .cc file.
  130   virtual void Init(const base::FilePath& filename,
  131                     const base::Closure& corruption_callback) OVERRIDE;
  132
  133   // Delete any on-disk files, including the permanent storage.
  134   virtual bool Delete() OVERRIDE;
  135
  136   // Get all add hash prefixes and full-length hashes, respectively, from
  137   // the store.
        // Results are delivered through the out-parameters.  The meaning of the
        // bool return is not visible in this header (presumably success).
  138   virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
  139   virtual bool GetAddFullHashes(
  140       std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;
  141
        // Chunk-level writes.  The Write*() arguments mirror the per-record
        // fields listed in the file-format comment at the top of this file
        // (add prefix, add hash, sub prefix, sub hash).  Presumably each
        // Write*() appends to the matching buffer declared below
        // (|add_prefixes_|, |add_hashes_|, |sub_prefixes_|, |sub_hashes_|),
        // which are documented as collecting data between BeginChunk() and
        // FinishChunk() -- confirm against the .cc file.
  142   virtual bool BeginChunk() OVERRIDE;
  143
  144   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
  145   virtual bool WriteAddHash(int32 chunk_id,
  146                             base::Time receive_time,
  147                             const SBFullHash& full_hash) OVERRIDE;
  148   virtual bool WriteSubPrefix(int32 chunk_id,
  149                               int32 add_chunk_id, SBPrefix prefix) OVERRIDE;
  150   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
  151                             const SBFullHash& full_hash) OVERRIDE;
  152   virtual bool FinishChunk() OVERRIDE;
  153
        // Transaction boundaries.  The intended sequence of steps is outlined
        // under "The overall transaction works like this" in the file comment.
        // DoUpdate() below takes the same arguments as FinishUpdate() and is
        // documented as doing the actual update work on its behalf.
  154   virtual bool BeginUpdate() OVERRIDE;
  155   virtual bool FinishUpdate(
  156       safe_browsing::PrefixSetBuilder* builder,
  157       std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;
  158   virtual bool CancelUpdate() OVERRIDE;
  159
        // Bookkeeping of which add/sub chunks have been seen.  Presumably
        // backed by |add_chunks_cache_| and |sub_chunks_cache_|, which are
        // documented as loaded on BeginUpdate() so they can be queried during
        // the transaction -- confirm against the .cc file.
  160   virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
  161   virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
  162   virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
  163   virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
  164   virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
  165   virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;
  166
        // Chunk deletion requests.  Presumably recorded in |add_del_cache_| and
        // |sub_del_cache_|, which are documented as being applied on
        // FinishUpdate() -- confirm against the .cc file.
  167   virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
  168   virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;
  169
  170   // Verify |file_|'s checksum, calling the corruption callback if it
  171   // does not check out.  Empty input is considered valid.
  172   virtual bool CheckValidity() OVERRIDE;
  173
  174   // Returns the name of the temporary file used to buffer data for
  175   // |filename|.  Exported for unit tests.
        // The returned path is |filename| with the literal suffix "_new"
        // appended to its value.
  176   static const base::FilePath TemporaryFileForFilename(
  177       const base::FilePath& filename) {
  178     return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
  179   }
  180
  181   // Delete any on-disk files, including the permanent storage.
        // Static, so it can be called without a store instance.
        // NOTE(review): |basename| is presumably the main database filename
        // (the same value passed to Init()) -- confirm against callers.
  182   static bool DeleteStore(const base::FilePath& basename);
  183
  184  private:
  185   // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean
  186   // up correctly in case of error.
        // Takes the same arguments as FinishUpdate().
  187   virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder,
  188                         std::vector<SBAddFullHash>* add_full_hashes_result);
  189
  190   // Some very lucky users have an original-format file still in their
  191   // profile.  Check for it and delete, recording a histogram for the
  192   // result (no histogram for not-found).  Logically this
  193   // would make more sense at the SafeBrowsingDatabase level, but
  194   // practically speaking that code doesn't touch files directly.
  195   static void CheckForOriginalAndDelete(const base::FilePath& filename);
  196
  197   // Close all files and clear all buffers.
        // NOTE(review): the meaning of the bool return is not visible in this
        // header.
  198   bool Close();
  199
  200   // Calls |corruption_callback_| if non-NULL, always returns false as
  201   // a convenience to the caller.
  202   bool OnCorruptDatabase();
  203
  204   // Helper for creating a corruption callback for |old_store_|.
  205   // TODO(shess): Remove after migration.
        // NOTE(review): this class declares no |old_store_| member, so the
        // comment above looks stale -- confirm whether this helper is still
        // needed.
  206   void HandleCorruptDatabase();
  207
  208   // Clear temporary buffers used to accumulate chunk data.
        // Always returns true.
  209   bool ClearChunkBuffers() {
  210     // NOTE: .clear() doesn't release memory.
  211     // TODO(shess): Figure out if this is overkill.  Some amount of
  212     // pre-reserved space is probably reasonable between each chunk
  213     // collected.
          // Swapping with an empty temporary hands each buffer's old storage
          // to the temporary, which is destroyed at the end of the statement.
  214     SBAddPrefixes().swap(add_prefixes_);
  215     SBSubPrefixes().swap(sub_prefixes_);
  216     std::vector<SBAddFullHash>().swap(add_hashes_);
  217     std::vector<SBSubFullHash>().swap(sub_hashes_);
  218     return true;
  219   }
  220
  221   // Clear all buffers used during update.
        // That is: the per-chunk buffers, the written-chunk counter, the
        // chunks-seen caches and the deleted-chunk caches.
  222   void ClearUpdateBuffers() {
  223     ClearChunkBuffers();
  224     chunks_written_ = 0;
          // Same swap-with-empty-temporary approach as ClearChunkBuffers().
  225     std::set<int32>().swap(add_chunks_cache_);
  226     std::set<int32>().swap(sub_chunks_cache_);
  227     base::hash_set<int32>().swap(add_del_cache_);
  228     base::hash_set<int32>().swap(sub_del_cache_);
  229   }
  230
  231   // Buffers for collecting data between BeginChunk() and
  232   // FinishChunk().
  233   SBAddPrefixes add_prefixes_;
  234   SBSubPrefixes sub_prefixes_;
  235   std::vector<SBAddFullHash> add_hashes_;
  236   std::vector<SBSubFullHash> sub_hashes_;
  237
  238   // Count of chunks collected in |new_file_|.
        // Reset to 0 by ClearUpdateBuffers().
  239   int chunks_written_;
  240
  241   // Name of the main database file.
  242   base::FilePath filename_;
  243
  244   // Handles to the main and scratch files.  |empty_| is true if the
  245   // main file didn't exist when the update was started.
  246   base::ScopedFILE file_;
  247   base::ScopedFILE new_file_;
  248   bool empty_;
  249
  250   // Cache of chunks which have been seen.  Loaded from the database
  251   // on BeginUpdate() so that it can be queried during the
  252   // transaction.
  253   std::set<int32> add_chunks_cache_;
  254   std::set<int32> sub_chunks_cache_;
  255
  256   // Cache the set of deleted chunks during a transaction, applied on
  257   // FinishUpdate().
  258   // TODO(shess): If the set is small enough, hash_set<> might be
  259   // slower than plain set<>.
        // NOTE(review): no include visible in this header declares
        // base::hash_set; presumably it arrives transitively via
        // safe_browsing_store.h -- confirm, or include it directly.
  260   base::hash_set<int32> add_del_cache_;
  261   base::hash_set<int32> sub_del_cache_;
  262
        // Closure supplied to Init(); called by OnCorruptDatabase() when
        // non-NULL.
  263   base::Closure corruption_callback_;
  264
  265   // Tracks whether corruption has already been seen in the current
  266   // update, so that only one instance is recorded in the stats.
  267   // TODO(shess): Remove with format-migration support.
  268   bool corruption_seen_;
  269
  270   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
  271 };
 272
 273 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_