1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/prefix_set.h"
9 #include "base/file_util.h"
10 #include "base/files/scoped_file.h"
11 #include "base/logging.h"
13 #include "base/metrics/histogram.h"
14 #include "base/metrics/sparse_histogram.h"
18 // |kMagic| should be reasonably unique, and not match itself across
19 // endianness changes. I generated this value with:
20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
21 static uint32 kMagic
= 0x864088dd;
24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
26 // Version 3: ????????/r?????? by shess@chromium.org on 2014-04-??
28 // Version 2 layout is identical to version 1. The sort order of |index_|
29 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
30 // Version 3 adds storage for full hashes.
31 static uint32 kVersion
= 3;
32 static uint32 kDeprecatedVersion
= 1; // And lower.
46 uint32 full_hashes_size
;
49 // Common std::vector<> implementations add capacity by multiplying from the
50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
51 // amortized constant time. This is not necessary for insert() at end(), but
52 // AFAICT it seems true for some implementations. SBPrefix values should
53 // uniformly cover the 32-bit space, so the final size can be estimated given a
54 // subset of the input.
56 // |kEstimateThreshold| is when estimates start converging. Results are strong
57 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
58 // resizing capacity from >50% to 100%.
60 // TODO(shess): I'm sure there is math in the world to describe good settings
61 // for estimating the size of a uniformly-distributed set of integers from a
62 // sorted subset. I do not have such math in me, so I assumed that my current
63 // organic database of prefixes was scale-free, and wrote a script to see how
64 // often given slop values would always suffice for given strides. At 1<<30,
65 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
67 // TODO(shess): A smaller threshold uses less transient space in reallocation.
68 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
69 // is that a smaller threshold needs more slop (locked down for the long term).
70 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
71 const SBPrefix kEstimateThreshold
= 1 << 30;
72 size_t EstimateFinalCount(SBPrefix current_prefix
, size_t current_count
) {
73 // estimated_count / current_count == estimated_max / current_prefix
74 // For large input sets, estimated_max of 2^32 is close enough.
75 const size_t estimated_prefix_count
= static_cast<size_t>(
76 (static_cast<uint64
>(current_count
) << 32) / current_prefix
);
78 // The estimate has an error bar, if the final total is below the estimate, no
79 // harm, but if it is above a capacity resize will happen at nearly 100%. Add
80 // some slop to make sure all cases are covered.
81 return estimated_prefix_count
+ estimated_prefix_count
/ 100;
86 namespace safe_browsing
{
88 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
90 bool PrefixSet::PrefixLess(const IndexPair
& a
, const IndexPair
& b
) {
91 return a
.first
< b
.first
;
94 PrefixSet::PrefixSet() {
97 PrefixSet::PrefixSet(IndexVector
* index
,
98 std::vector
<uint16
>* deltas
,
99 std::vector
<SBFullHash
>* full_hashes
) {
100 DCHECK(index
&& deltas
&& full_hashes
);
102 deltas_
.swap(*deltas
);
103 full_hashes_
.swap(*full_hashes
);
106 PrefixSet::~PrefixSet() {}
108 bool PrefixSet::PrefixExists(SBPrefix prefix
) const {
112 // Find the first position after |prefix| in |index_|.
113 IndexVector::const_iterator iter
=
114 std::upper_bound(index_
.begin(), index_
.end(),
115 IndexPair(prefix
, 0), PrefixLess
);
117 // |prefix| comes before anything that's in the set.
118 if (iter
== index_
.begin())
121 // Capture the upper bound of our target entry's deltas.
122 const size_t bound
= (iter
== index_
.end() ? deltas_
.size() : iter
->second
);
124 // Back up to the entry our target is in.
127 // All prefixes in |index_| are in the set.
128 SBPrefix current
= iter
->first
;
129 if (current
== prefix
)
132 // Scan forward accumulating deltas while a match is possible.
133 for (size_t di
= iter
->second
; di
< bound
&& current
< prefix
; ++di
) {
134 current
+= deltas_
[di
];
137 return current
== prefix
;
140 bool PrefixSet::Exists(const SBFullHash
& hash
) const {
141 if (std::binary_search(full_hashes_
.begin(), full_hashes_
.end(),
142 hash
, SBFullHashLess
)) {
145 return PrefixExists(hash
.prefix
);
148 void PrefixSet::GetPrefixes(std::vector
<SBPrefix
>* prefixes
) const {
149 prefixes
->reserve(index_
.size() + deltas_
.size());
151 for (size_t ii
= 0; ii
< index_
.size(); ++ii
) {
152 // The deltas for this |index_| entry run to the next index entry,
153 // or the end of the deltas.
154 const size_t deltas_end
=
155 (ii
+ 1 < index_
.size()) ? index_
[ii
+ 1].second
: deltas_
.size();
157 SBPrefix current
= index_
[ii
].first
;
158 prefixes
->push_back(current
);
159 for (size_t di
= index_
[ii
].second
; di
< deltas_end
; ++di
) {
160 current
+= deltas_
[di
];
161 prefixes
->push_back(current
);
167 scoped_ptr
<PrefixSet
> PrefixSet::LoadFile(const base::FilePath
& filter_name
) {
169 if (!base::GetFileSize(filter_name
, &size_64
))
170 return scoped_ptr
<PrefixSet
>();
171 using base::MD5Digest
;
172 // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is
174 if (size_64
< static_cast<int64
>(sizeof(FileHeader_v2
) + sizeof(MD5Digest
)))
175 return scoped_ptr
<PrefixSet
>();
177 base::ScopedFILE
file(base::OpenFile(filter_name
, "rb"));
179 return scoped_ptr
<PrefixSet
>();
182 size_t read
= fread(&header
, sizeof(header
), 1, file
.get());
184 return scoped_ptr
<PrefixSet
>();
186 // The file looks valid, start building the digest.
187 base::MD5Context context
;
188 base::MD5Init(&context
);
189 base::MD5Update(&context
, base::StringPiece(reinterpret_cast<char*>(&header
),
192 if (header
.magic
!= kMagic
)
193 return scoped_ptr
<PrefixSet
>();
195 // Track version read to inform removal of support for older versions.
196 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header
.version
);
198 // TODO(shess): <http://crbug.com/368044> for removing v2 support.
199 size_t header_size
= sizeof(header
);
200 if (header
.version
<= kDeprecatedVersion
) {
201 return scoped_ptr
<PrefixSet
>();
202 } else if (header
.version
== 2) {
203 // Rewind the file and restart building the digest with the old header
205 FileHeader_v2 v2_header
;
206 if (0 != fseek(file
.get(), 0, SEEK_SET
))
207 return scoped_ptr
<PrefixSet
>();
209 size_t read
= fread(&v2_header
, sizeof(v2_header
), 1, file
.get());
211 return scoped_ptr
<PrefixSet
>();
213 base::MD5Init(&context
);
214 base::MD5Update(&context
,
215 base::StringPiece(reinterpret_cast<char*>(&v2_header
),
218 // The current header is a superset of the old header, fill it in with the
220 header
.magic
= v2_header
.magic
;
221 header
.version
= v2_header
.version
;
222 header
.index_size
= v2_header
.index_size
;
223 header
.deltas_size
= v2_header
.deltas_size
;
224 header
.full_hashes_size
= 0;
225 header_size
= sizeof(v2_header
);
226 } else if (header
.version
!= kVersion
) {
227 return scoped_ptr
<PrefixSet
>();
231 const size_t index_bytes
= sizeof(index
[0]) * header
.index_size
;
233 std::vector
<uint16
> deltas
;
234 const size_t deltas_bytes
= sizeof(deltas
[0]) * header
.deltas_size
;
236 std::vector
<SBFullHash
> full_hashes
;
237 const size_t full_hashes_bytes
=
238 sizeof(full_hashes
[0]) * header
.full_hashes_size
;
240 // Check for bogus sizes before allocating any space.
241 const size_t expected_bytes
= header_size
+
242 index_bytes
+ deltas_bytes
+ full_hashes_bytes
+ sizeof(MD5Digest
);
243 if (static_cast<int64
>(expected_bytes
) != size_64
)
244 return scoped_ptr
<PrefixSet
>();
246 // Read the index vector. Herb Sutter indicates that vectors are
247 // guaranteed to be contiuguous, so reading to where element 0 lives
249 if (header
.index_size
) {
250 index
.resize(header
.index_size
);
251 read
= fread(&(index
[0]), sizeof(index
[0]), index
.size(), file
.get());
252 if (read
!= index
.size())
253 return scoped_ptr
<PrefixSet
>();
254 base::MD5Update(&context
,
255 base::StringPiece(reinterpret_cast<char*>(&(index
[0])),
259 // Read vector of deltas.
260 if (header
.deltas_size
) {
261 deltas
.resize(header
.deltas_size
);
262 read
= fread(&(deltas
[0]), sizeof(deltas
[0]), deltas
.size(), file
.get());
263 if (read
!= deltas
.size())
264 return scoped_ptr
<PrefixSet
>();
265 base::MD5Update(&context
,
266 base::StringPiece(reinterpret_cast<char*>(&(deltas
[0])),
270 // Read vector of full hashes.
271 if (header
.full_hashes_size
) {
272 full_hashes
.resize(header
.full_hashes_size
);
273 read
= fread(&(full_hashes
[0]), sizeof(full_hashes
[0]), full_hashes
.size(),
275 if (read
!= full_hashes
.size())
276 return scoped_ptr
<PrefixSet
>();
277 base::MD5Update(&context
,
279 reinterpret_cast<char*>(&(full_hashes
[0])),
283 base::MD5Digest calculated_digest
;
284 base::MD5Final(&calculated_digest
, &context
);
286 base::MD5Digest file_digest
;
287 read
= fread(&file_digest
, sizeof(file_digest
), 1, file
.get());
289 return scoped_ptr
<PrefixSet
>();
291 if (0 != memcmp(&file_digest
, &calculated_digest
, sizeof(file_digest
)))
292 return scoped_ptr
<PrefixSet
>();
294 // Steals vector contents using swap().
295 return scoped_ptr
<PrefixSet
>(new PrefixSet(&index
, &deltas
, &full_hashes
));
298 bool PrefixSet::WriteFile(const base::FilePath
& filter_name
) const {
300 header
.magic
= kMagic
;
301 header
.version
= kVersion
;
302 header
.index_size
= static_cast<uint32
>(index_
.size());
303 header
.deltas_size
= static_cast<uint32
>(deltas_
.size());
304 header
.full_hashes_size
= static_cast<uint32
>(full_hashes_
.size());
306 // Sanity check that the 32-bit values never mess things up.
307 if (static_cast<size_t>(header
.index_size
) != index_
.size() ||
308 static_cast<size_t>(header
.deltas_size
) != deltas_
.size() ||
309 static_cast<size_t>(header
.full_hashes_size
) != full_hashes_
.size()) {
314 base::ScopedFILE
file(base::OpenFile(filter_name
, "wb"));
318 base::MD5Context context
;
319 base::MD5Init(&context
);
321 // TODO(shess): The I/O code in safe_browsing_store_file.cc would
322 // sure be useful about now.
323 size_t written
= fwrite(&header
, sizeof(header
), 1, file
.get());
326 base::MD5Update(&context
, base::StringPiece(reinterpret_cast<char*>(&header
),
329 // As for reads, the standard guarantees the ability to access the
330 // contents of the vector by a pointer to an element.
332 const size_t index_bytes
= sizeof(index_
[0]) * index_
.size();
333 written
= fwrite(&(index_
[0]), sizeof(index_
[0]), index_
.size(),
335 if (written
!= index_
.size())
337 base::MD5Update(&context
,
339 reinterpret_cast<const char*>(&(index_
[0])),
343 if (deltas_
.size()) {
344 const size_t deltas_bytes
= sizeof(deltas_
[0]) * deltas_
.size();
345 written
= fwrite(&(deltas_
[0]), sizeof(deltas_
[0]), deltas_
.size(),
347 if (written
!= deltas_
.size())
349 base::MD5Update(&context
,
351 reinterpret_cast<const char*>(&(deltas_
[0])),
355 if (full_hashes_
.size()) {
356 const size_t elt_size
= sizeof(full_hashes_
[0]);
357 const size_t elts
= full_hashes_
.size();
358 const size_t full_hashes_bytes
= elt_size
* elts
;
359 written
= fwrite(&(full_hashes_
[0]), elt_size
, elts
, file
.get());
362 base::MD5Update(&context
,
364 reinterpret_cast<const char*>(&(full_hashes_
[0])),
368 base::MD5Digest digest
;
369 base::MD5Final(&digest
, &context
);
370 written
= fwrite(&digest
, sizeof(digest
), 1, file
.get());
374 // TODO(shess): Can this code check that the close was successful?
380 void PrefixSet::AddRun(SBPrefix index_prefix
,
381 const uint16
* run_begin
, const uint16
* run_end
) {
382 // Preempt organic capacity decisions for |delta_| once strong estimates can
384 if (index_prefix
> kEstimateThreshold
&&
385 deltas_
.capacity() < deltas_
.size() + (run_end
- run_begin
)) {
386 deltas_
.reserve(EstimateFinalCount(index_prefix
, deltas_
.size()));
389 index_
.push_back(std::make_pair(index_prefix
, deltas_
.size()));
390 deltas_
.insert(deltas_
.end(), run_begin
, run_end
);
393 PrefixSetBuilder::PrefixSetBuilder()
394 : prefix_set_(new PrefixSet()) {
397 PrefixSetBuilder::PrefixSetBuilder(const std::vector
<SBPrefix
>& prefixes
)
398 : prefix_set_(new PrefixSet()) {
399 for (size_t i
= 0; i
< prefixes
.size(); ++i
) {
400 AddPrefix(prefixes
[i
]);
404 PrefixSetBuilder::~PrefixSetBuilder() {
407 scoped_ptr
<PrefixSet
> PrefixSetBuilder::GetPrefixSet(
408 const std::vector
<SBFullHash
>& hashes
) {
409 DCHECK(prefix_set_
.get());
411 // Flush runs until buffered data is gone.
412 while (!buffer_
.empty()) {
416 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
417 // they're almost free.
418 PrefixSet::IndexVector(prefix_set_
->index_
).swap(prefix_set_
->index_
);
420 prefix_set_
->full_hashes_
= hashes
;
421 std::sort(prefix_set_
->full_hashes_
.begin(), prefix_set_
->full_hashes_
.end(),
424 return prefix_set_
.Pass();
427 scoped_ptr
<PrefixSet
> PrefixSetBuilder::GetPrefixSetNoHashes() {
428 return GetPrefixSet(std::vector
<SBFullHash
>()).Pass();
431 void PrefixSetBuilder::EmitRun() {
432 DCHECK(prefix_set_
.get());
434 SBPrefix prev_prefix
= buffer_
[0];
435 uint16 run
[PrefixSet::kMaxRun
];
439 for (i
= 1; i
< buffer_
.size() && run_pos
< PrefixSet::kMaxRun
; ++i
) {
440 // Calculate the delta. |unsigned| is mandatory, because the
441 // sorted_prefixes could be more than INT_MAX apart.
442 DCHECK_GT(buffer_
[i
], prev_prefix
);
443 const unsigned delta
= buffer_
[i
] - prev_prefix
;
444 const uint16 delta16
= static_cast<uint16
>(delta
);
446 // Break the run if the delta doesn't fit.
447 if (delta
!= static_cast<unsigned>(delta16
))
450 // Continue the run of deltas.
451 run
[run_pos
++] = delta16
;
452 DCHECK_EQ(static_cast<unsigned>(run
[run_pos
- 1]), delta
);
454 prev_prefix
= buffer_
[i
];
456 prefix_set_
->AddRun(buffer_
[0], run
, run
+ run_pos
);
457 buffer_
.erase(buffer_
.begin(), buffer_
.begin() + i
);
460 void PrefixSetBuilder::AddPrefix(SBPrefix prefix
) {
461 DCHECK(prefix_set_
.get());
463 if (buffer_
.empty()) {
464 DCHECK(prefix_set_
->index_
.empty());
465 DCHECK(prefix_set_
->deltas_
.empty());
468 if (buffer_
.back() == prefix
)
471 DCHECK_LT(buffer_
.back(), prefix
);
473 buffer_
.push_back(prefix
);
475 // Flush buffer when a run can be constructed. +1 for the index item, and +1
476 // to leave at least one item in the buffer for dropping duplicates.
477 if (buffer_
.size() > PrefixSet::kMaxRun
+ 2)
481 } // namespace safe_browsing