1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
9 #include "courgette/difference_estimator.h"
11 #include "base/containers/hash_tables.h"
15 // Our difference measure is the number of k-tuples that occur in Subject that
16 // don't occur in Base.
17 const int kTupleSize
= 4;
21 static_assert(kTupleSize
>= 4 && kTupleSize
<= 8,
22 "kTupleSize should be between 4 and 8");
24 size_t HashTuple(const uint8
* source
) {
25 size_t hash1
= *reinterpret_cast<const uint32
*>(source
);
26 size_t hash2
= *reinterpret_cast<const uint32
*>(source
+ kTupleSize
- 4);
27 size_t hash
= ((hash1
* 17 + hash2
* 37) + (hash1
>> 17)) ^ (hash2
>> 23);
31 bool RegionsEqual(const Region
& a
, const Region
& b
) {
32 if (a
.length() != b
.length())
34 return memcmp(a
.start(), b
.start(), a
.length()) == 0;
37 } // anonymous namepace
39 class DifferenceEstimator::Base
{
41 explicit Base(const Region
& region
) : region_(region
) { }
44 if (region_
.length() < kTupleSize
)
46 const uint8
* start
= region_
.start();
47 const uint8
* end
= region_
.end() - (kTupleSize
- 1);
48 for (const uint8
* p
= start
; p
< end
; ++p
) {
49 size_t hash
= HashTuple(p
);
54 const Region
& region() const { return region_
; }
58 base::hash_set
<size_t> hashes_
;
60 friend class DifferenceEstimator
;
61 DISALLOW_COPY_AND_ASSIGN(Base
);
64 class DifferenceEstimator::Subject
{
66 explicit Subject(const Region
& region
) : region_(region
) {}
68 const Region
& region() const { return region_
; }
73 DISALLOW_COPY_AND_ASSIGN(Subject
);
76 DifferenceEstimator::DifferenceEstimator() {}
78 DifferenceEstimator::~DifferenceEstimator() {
79 for (size_t i
= 0; i
< owned_bases_
.size(); ++i
)
80 delete owned_bases_
[i
];
81 for (size_t i
= 0; i
< owned_subjects_
.size(); ++i
)
82 delete owned_subjects_
[i
];
85 DifferenceEstimator::Base
* DifferenceEstimator::MakeBase(const Region
& region
) {
86 Base
* base
= new Base(region
);
88 owned_bases_
.push_back(base
);
92 DifferenceEstimator::Subject
* DifferenceEstimator::MakeSubject(
93 const Region
& region
) {
94 Subject
* subject
= new Subject(region
);
95 owned_subjects_
.push_back(subject
);
99 size_t DifferenceEstimator::Measure(Base
* base
, Subject
* subject
) {
100 size_t mismatches
= 0;
101 if (subject
->region().length() >= kTupleSize
) {
102 const uint8
* start
= subject
->region().start();
103 const uint8
* end
= subject
->region().end() - (kTupleSize
- 1);
105 const uint8
* p
= start
;
107 size_t hash
= HashTuple(p
);
108 if (base
->hashes_
.find(hash
) == base
->hashes_
.end()) {
115 if (mismatches
== 0) {
116 if (RegionsEqual(base
->region(), subject
->region()))
119 ++mismatches
; // Guarantee not zero.