Refactor to simplify safe-browsing gethash cache.
[chromium-blink-merge.git] / chrome / browser / safe_browsing / safe_browsing_store_file.h
blobf54e39b335660f2f008ae0aee00cccc2ed5198c7
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
8 #include <set>
9 #include <vector>
11 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
13 #include "base/callback.h"
14 #include "base/files/file_path.h"
15 #include "base/files/scoped_file.h"
17 // Implement SafeBrowsingStore in terms of a flat file. The file
18 // format is pretty literal:
20 // int32 magic; // magic number "validating" file
21 // int32 version; // format version
23 // // Counts for the various data which follows the header.
24 // uint32 add_chunk_count; // Chunks seen, including empties.
25 // uint32 sub_chunk_count; // Ditto.
26 // uint32 shard_stride; // SBPrefix space covered per shard.
27 // // 0==entire space in one shard.
28 // // Sorted by chunk_id.
29 // array[add_chunk_count] {
30 // int32 chunk_id;
31 // }
32 // // Sorted by chunk_id.
33 // array[sub_chunk_count] {
34 // int32 chunk_id;
35 // }
36 // MD5Digest header_checksum; // Checksum over preceding data.
38 // // Sorted by prefix, then add chunk_id, then hash, both within shards and
39 // // overall.
40 // array[from 0 to wraparound to 0 by shard_stride] {
41 // uint32 add_prefix_count;
42 // uint32 sub_prefix_count;
43 // uint32 add_hash_count;
44 // uint32 sub_hash_count;
45 // array[add_prefix_count] {
46 // int32 chunk_id;
47 // uint32 prefix;
48 // }
49 // array[sub_prefix_count] {
50 // int32 chunk_id;
51 // int32 add_chunk_id;
52 // uint32 add_prefix;
53 // }
54 // array[add_hash_count] {
55 // int32 chunk_id;
56 // int32 received_time; // From base::Time::ToTimeT().
57 // char[32] full_hash;
58 // }
59 // array[sub_hash_count] {
60 // int32 chunk_id;
61 // int32 add_chunk_id;
62 // char[32] add_full_hash;
63 // }
64 // }
65 // MD5Digest checksum; // Checksum over entire file.
67 // The checksums are used to allow writing the file without doing an expensive
68 // fsync(). Since the data can be re-fetched, failing the checksum is not
69 // catastrophic. Histograms indicate that file corruption here is pretty
70 // uncommon.
72 // The |header_checksum| is present to guarantee valid header and chunk data for
73 // updates. Only that part of the file needs to be read to post the update.
75 // |shard_stride| breaks the file into approximately-equal portions, allowing
76 // updates to stream from one file to another with modest memory usage. It is
77 // dynamic to adjust to different file sizes without adding excessive overhead.
79 // During the course of an update, uncommitted data is stored in a
80 // temporary file (which is later re-used to commit). This is an
81 // array of chunks, with the count kept in memory until the end of the
82 // transaction. The format of this file is like the main file, with
83 // the list of chunks seen omitted, as that data is tracked in-memory:
85 // array[] {
86 // uint32 add_prefix_count;
87 // uint32 sub_prefix_count;
88 // uint32 add_hash_count;
89 // uint32 sub_hash_count;
90 // array[add_prefix_count] {
91 // int32 chunk_id;
92 // uint32 prefix;
93 // }
94 // array[sub_prefix_count] {
95 // int32 chunk_id;
96 // int32 add_chunk_id;
97 // uint32 add_prefix;
98 // }
99 // array[add_hash_count] {
100 // int32 chunk_id;
101 // int32 received_time; // From base::Time::ToTimeT().
102 // char[32] full_hash;
103 // }
104 // array[sub_hash_count] {
105 // int32 chunk_id;
106 // int32 add_chunk_id;
107 // char[32] add_full_hash;
108 // }
109 // }
111 // The overall transaction works like this:
112 // - Open the original file to get the chunks-seen data.
113 // - Open a temp file for storing new chunk info.
114 // - Write new chunks to the temp file.
115 // - When the transaction is finished:
116 // - Read the update data from the temp file into memory.
117 // - Overwrite the temp file with new header data.
118 // - Until done:
119 // - Read shards of the original file's data into memory.
120 // - Merge from the update data.
121 // - Write shards to the temp file.
122 // - Delete original file.
123 // - Rename temp file to original filename.
125 class SafeBrowsingStoreFile : public SafeBrowsingStore {
126 public:
127 SafeBrowsingStoreFile();
128 virtual ~SafeBrowsingStoreFile();
130 virtual void Init(const base::FilePath& filename,
131 const base::Closure& corruption_callback) OVERRIDE;
133 // Delete any on-disk files, including the permanent storage.
134 virtual bool Delete() OVERRIDE;
136 // Get all add hash prefixes and full-length hashes, respectively, from
137 // the store.
138 virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
139 virtual bool GetAddFullHashes(
140 std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;
142 virtual bool BeginChunk() OVERRIDE;
144 virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
145 virtual bool WriteAddHash(int32 chunk_id,
146 base::Time receive_time,
147 const SBFullHash& full_hash) OVERRIDE;
148 virtual bool WriteSubPrefix(int32 chunk_id,
149 int32 add_chunk_id, SBPrefix prefix) OVERRIDE;
150 virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
151 const SBFullHash& full_hash) OVERRIDE;
152 virtual bool FinishChunk() OVERRIDE;
154 virtual bool BeginUpdate() OVERRIDE;
155 virtual bool FinishUpdate(
156 safe_browsing::PrefixSetBuilder* builder,
157 std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;
158 virtual bool CancelUpdate() OVERRIDE;
160 virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
161 virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
162 virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
163 virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
164 virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
165 virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;
167 virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
168 virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;
170 // Verify |file_|'s checksum, calling the corruption callback if it
171 // does not check out. Empty input is considered valid.
172 virtual bool CheckValidity() OVERRIDE;
174 // Returns the name of the temporary file used to buffer data for
175 // |filename|. Exported for unit tests.
176 static const base::FilePath TemporaryFileForFilename(
177 const base::FilePath& filename) {
178 return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
181 // Delete any on-disk files, including the permanent storage.
182 static bool DeleteStore(const base::FilePath& basename);
184 private:
185 // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean
186 // up correctly in case of error.
187 virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder,
188 std::vector<SBAddFullHash>* add_full_hashes_result);
190 // Some very lucky users have an original-format file still in their
191 // profile. Check for it and delete, recording a histogram for the
192 // result (no histogram for not-found). Logically this
193 // would make more sense at the SafeBrowsingDatabase level, but
194 // practically speaking that code doesn't touch files directly.
195 static void CheckForOriginalAndDelete(const base::FilePath& filename);
197 // Close all files and clear all buffers.
198 bool Close();
200 // Calls |corruption_callback_| if non-NULL, always returns false as
201 // a convenience to the caller.
202 bool OnCorruptDatabase();
204 // Helper for creating a corruption callback for |old_store_|.
205 // TODO(shess): Remove after migration.
206 void HandleCorruptDatabase();
208 // Clear temporary buffers used to accumulate chunk data.
209 bool ClearChunkBuffers() {
210 // NOTE: .clear() doesn't release memory.
211 // TODO(shess): Figure out if this is overkill. Some amount of
212 // pre-reserved space is probably reasonable between each chunk
213 // collected.
214 SBAddPrefixes().swap(add_prefixes_);
215 SBSubPrefixes().swap(sub_prefixes_);
216 std::vector<SBAddFullHash>().swap(add_hashes_);
217 std::vector<SBSubFullHash>().swap(sub_hashes_);
218 return true;
221 // Clear all buffers used during update.
222 void ClearUpdateBuffers() {
223 ClearChunkBuffers();
224 chunks_written_ = 0;
225 std::set<int32>().swap(add_chunks_cache_);
226 std::set<int32>().swap(sub_chunks_cache_);
227 base::hash_set<int32>().swap(add_del_cache_);
228 base::hash_set<int32>().swap(sub_del_cache_);
231 // Buffers for collecting data between BeginChunk() and
232 // FinishChunk().
233 SBAddPrefixes add_prefixes_;
234 SBSubPrefixes sub_prefixes_;
235 std::vector<SBAddFullHash> add_hashes_;
236 std::vector<SBSubFullHash> sub_hashes_;
238 // Count of chunks collected in |new_file_|.
239 int chunks_written_;
241 // Name of the main database file.
242 base::FilePath filename_;
244 // Handles to the main and scratch files. |empty_| is true if the
245 // main file didn't exist when the update was started.
246 base::ScopedFILE file_;
247 base::ScopedFILE new_file_;
248 bool empty_;
250 // Cache of chunks which have been seen. Loaded from the database
251 // on BeginUpdate() so that it can be queried during the
252 // transaction.
253 std::set<int32> add_chunks_cache_;
254 std::set<int32> sub_chunks_cache_;
256 // Cache the set of deleted chunks during a transaction, applied on
257 // FinishUpdate().
258 // TODO(shess): If the set is small enough, hash_set<> might be
259 // slower than plain set<>.
260 base::hash_set<int32> add_del_cache_;
261 base::hash_set<int32> sub_del_cache_;
263 base::Closure corruption_callback_;
265 // Tracks whether corruption has already been seen in the current
266 // update, so that only one instance is recorded in the stats.
267 // TODO(shess): Remove with format-migration support.
268 bool corruption_seen_;
270 DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
273 #endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_