1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "courgette/adjustment_method.h"
15 #include "base/basictypes.h"
16 #include "base/format_macros.h"
17 #include "base/logging.h"
18 #include "base/stringprintf.h"
19 #include "base/time.h"
20 #include "courgette/assembly_program.h"
21 #include "courgette/courgette.h"
22 #include "courgette/encoded_program.h"
23 #include "courgette/image_info.h"
27 Shingle weighting matching.
29 We have a sequence S1 of symbols from alphabet A1={A,B,C,...} called the 'model'
30 and a second sequence of S2 of symbols from alphabet A2={U,V,W,....} called the
31 'program'. Each symbol in A1 has a unique numerical name or index. We can
32 transcribe the sequence S1 to a sequence T1 of indexes of the symbols. We wish
33 to assign indexes to the symbols in A2 so that when we transcribe S2 into T2, T2
34 has long subsequences that occur in T1. This will ensure that the sequence
35 T1;T2 compresses to be only slightly larger than the compressed T1.
37 The algorithm for matching members of S2 with members of S1 is eager - it makes
38 matches without backtracking, until no more matches can be made. Each variable
39 (symbol) U,V,... in A2 has a set of candidates from A1, each candidate with a
40 weight summarizing the evidence for the match. We keep a VariableQueue of
41 U,V,... sorted by how much the evidence for the best choice outweighs the
42 evidence for the second choice, i.e. prioritized by how 'clear cut' the best
43 assignment is. We pick the variable with the most clear-cut candidate, make the
44 assignment, adjust the evidence and repeat.
46 What has not been described so far is how the evidence is gathered and
47 maintained. We are working under the assumption that S1 and S2 are largely
48 similar. (A different assumption might be that S1 and S2 are dissimilar except
49 for many long subsequences.)
51 A naive algorithm would consider all pairs (A,U) and for each pair assess the
52 benefit, or score, the assignment U:=A. The score might count the number of
53 occurrences of U in S2 which appear in similar contexts to A in S1.
55 To distinguish contexts we view S1 and S2 as a sequence of overlapping k-length
56 substrings or 'shingles'. Two shingles are compatible if the symbols in one
57 shingle could be matched with the symbols in the other symbol. For example, ABC
58 is *not* compatible with UVU because it would require conflicting matches A=U
59 and C=U. ABC is compatible with UVW, UWV, WUV, VUW etc. We can't tell which
60 until we make an assignment - the compatible shingles form an equivalence class.
61 After assigning U:=A then only UVW and UWV (equivalently AVW, AWV) are
62 compatible. As we make assignments the number of equivalence classes of
63 shingles increases and the number of members of each equivalence class
64 decreases. The compatibility test becomes more restrictive.
66 We gather evidence for the potential assignment U:=A by counting how many
67 shingles containing U are compatible with shingles containing A. Thus symbols
68 occurring a large number of times in compatible contexts will be assigned first.
70 Finding the 'most clear-cut' assignment by considering all pairs symbols and for
71 each pair comparing the contexts of each pair of occurrences of the symbols is
72 computationally infeasible. We get the job done in a reasonable time by
73 approaching it 'backwards' and making incremental changes as we make
76 First the shingles are partitioned according to compatibility. In S1=ABCDD and
77 S2=UVWXX we have a total of 6 shingles, each occuring once. (ABC:1 BCD:1 CDD:1;
78 UVW:1 VWX: WXX:1) all fit the pattern <V0 V1 V2> or the pattern <V0 V1 V1>. The
79 first pattern indicates that each position matches a different symbol, the
80 second pattern indicates that the second symbol is repeated.
82 pattern S1 members S2 members
83 <V0 V1 V2>: {ABC:1, BCD:1}; {UVW:1, VWX:1}
84 <V0 V1 V1>: {CDD:1} {WXX:1}
86 The second pattern appears to have a unique assignment but we don't make the
87 assignment on such scant evidence. If S1 and S2 do not match exactly, there
88 will be numerous spurious low-score matches like this. Instead we must see what
89 assignments are indicated by considering all of the evidence.
91 First pattern has 2 x 2 = 4 shingle pairs. For each pair we count the number
92 of symbol assignments. For ABC:a * UVW:b accumulate min(a,b) to each of
94 After accumulating over all 2 x 2 pairs:
99 The second pattern contributes:
108 From this we decide to assign X:=D (because this assignment has both the largest
109 difference above the next candidate (X:=C) and this is also the largest
110 proportionately over the sum of alternatives).
112 Lets assume D has numerical 'name' 77. The assignment X:=D sets X to 77 too.
113 Next we repartition all the shingles containing X or D:
115 pattern S1 members S2 members
116 <V0 V1 V2>: {ABC:1}; {UVW:1}
117 <V0 V1 77>: {BCD:1}; {VWX:1}
118 <V0 77 77>: {CDD:1} {WXX:1}
119 As we repartition, we recalculate the contributions to the scores:
123 All the remaining assignments are now fixed.
125 There is one step in the incremental algorithm that is still infeasibly
126 expensive: the contributions due to the cross product of large equivalence
127 classes. We settle for making an approximation by computing the contribution of
128 the cross product of only the most common shingles. The hope is that the noise
129 from the long tail of uncounted shingles is well below the scores being used to
130 pick assignments. The second hope is that as assignment are made, the large
131 equivalence class will be partitioned into smaller equivalence classes, reducing
134 In the code below the shingles are bigger (Shingle::kWidth = 5).
135 Class ShinglePattern holds the data for one pattern.
137 There is an optimization for this case:
138 <V0 V1 V1>: {CDD:1} {WXX:1}
140 Above we said that we don't make an assignment on this "scant evidence". There
141 is an exception: if there is only one variable unassigned (more like the <V0 77
142 77> pattern) AND there are no occurrences of C and W other than those counted in
143 this pattern, then there is no competing evidence and we go ahead with the
144 assignment immediately. This produces slightly better results because these
145 cases tend to be low-scoring and susceptible to small mistakes made in
146 low-scoring assignments in the approximation for large equivalence classes.
150 namespace courgette
{
151 namespace adjustment_method_2
{
153 ////////////////////////////////////////////////////////////////////////////////
155 class AssignmentCandidates
;
156 class LabelInfoMaker
;
158 class ShinglePattern
;
160 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to
161 // make the sequence of indexes similar to a 'model' program 'm'. Labels
162 // themselves don't have enough information to do this job, so we work with a
163 // LabelInfo surrogate for each label.
167 // Just a no-argument constructor and copy constructor. Actual LabelInfo
168 // objects are allocated in std::pair structs in a std::map.
170 : label_(NULL
), is_model_(false), debug_index_(0), refs_(0),
171 assignment_(NULL
), candidates_(NULL
)
176 AssignmentCandidates
* candidates();
178 Label
* label_
; // The label that this info a surrogate for.
180 uint32 is_model_
: 1; // Is the label in the model?
181 uint32 debug_index_
: 31; // A small number for naming the label in debug
182 // output. The pair (is_model_, debug_index_) is
185 int refs_
; // Number of times this Label is referenced.
187 LabelInfo
* assignment_
; // Label from other program corresponding to this.
189 std::vector
<uint32
> positions_
; // Offsets into the trace of references.
192 AssignmentCandidates
* candidates_
;
194 void operator=(const LabelInfo
*); // Disallow assignment only.
195 // Public compiler generated copy constructor is needed to constuct
196 // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
197 // inside a std::map.
200 typedef std::vector
<LabelInfo
*> Trace
;
202 std::string
ToString(const LabelInfo
* info
) {
204 base::StringAppendF(&s
, "%c%d", "pm"[info
->is_model_
], info
->debug_index_
);
205 if (info
->label_
->index_
!= Label::kNoIndex
)
206 base::StringAppendF(&s
, " (%d)", info
->label_
->index_
);
208 base::StringAppendF(&s
, " #%u", info
->refs_
);
212 // LabelInfoMaker maps labels to their surrogate LabelInfo objects.
213 class LabelInfoMaker
{
215 LabelInfoMaker() : debug_label_index_gen_(0) {}
217 LabelInfo
* MakeLabelInfo(Label
* label
, bool is_model
, uint32 position
) {
218 LabelInfo
& slot
= label_infos_
[label
];
219 if (slot
.label_
== NULL
) {
221 slot
.is_model_
= is_model
;
222 slot
.debug_index_
= ++debug_label_index_gen_
;
224 slot
.positions_
.push_back(position
);
229 void ResetDebugLabel() { debug_label_index_gen_
= 0; }
232 int debug_label_index_gen_
;
234 // Note LabelInfo is allocated 'flat' inside map::value_type, so the LabelInfo
235 // lifetimes are managed by the map.
236 std::map
<Label
*, LabelInfo
> label_infos_
;
238 DISALLOW_COPY_AND_ASSIGN(LabelInfoMaker
);
241 struct OrderLabelInfo
{
242 bool operator()(const LabelInfo
* a
, const LabelInfo
* b
) const {
243 if (a
->label_
->rva_
< b
->label_
->rva_
) return true;
244 if (a
->label_
->rva_
> b
->label_
->rva_
) return false;
245 if (a
== b
) return false;
246 return a
->positions_
< b
->positions_
; // Lexicographic ordering of vector.
250 // AssignmentCandidates is a priority queue of candidate assignments to
251 // a single program LabelInfo, |program_info_|.
252 class AssignmentCandidates
{
254 explicit AssignmentCandidates(LabelInfo
* program_info
)
255 : program_info_(program_info
) {}
257 LabelInfo
* program_info() const { return program_info_
; }
259 bool empty() const { return label_to_score_
.empty(); }
261 LabelInfo
* top_candidate() const { return queue_
.begin()->second
; }
263 void Update(LabelInfo
* model_info
, int delta_score
) {
264 LOG_ASSERT(delta_score
!= 0);
267 LabelToScore::iterator p
= label_to_score_
.find(model_info
);
268 if (p
!= label_to_score_
.end()) {
269 old_score
= p
->second
;
270 new_score
= old_score
+ delta_score
;
271 queue_
.erase(ScoreAndLabel(old_score
, p
->first
));
272 if (new_score
== 0) {
273 label_to_score_
.erase(p
);
275 p
->second
= new_score
;
276 queue_
.insert(ScoreAndLabel(new_score
, model_info
));
279 new_score
= delta_score
;
280 label_to_score_
.insert(std::make_pair(model_info
, new_score
));
281 queue_
.insert(ScoreAndLabel(new_score
, model_info
));
283 LOG_ASSERT(queue_
.size() == label_to_score_
.size());
286 int TopScore() const {
288 int second_value
= 0;
289 Queue::const_iterator p
= queue_
.begin();
290 if (p
!= queue_
.end()) {
291 first_value
= p
->first
;
293 if (p
!= queue_
.end()) {
294 second_value
= p
->first
;
297 return first_value
- second_value
;
300 bool HasPendingUpdates() { return !pending_updates_
.empty(); }
302 void AddPendingUpdate(LabelInfo
* model_info
, int delta_score
) {
303 LOG_ASSERT(delta_score
!= 0);
304 pending_updates_
[model_info
] += delta_score
;
307 void ApplyPendingUpdates() {
308 // TODO(sra): try to walk |pending_updates_| and |label_to_score_| in
309 // lockstep. Try to batch updates to |queue_|.
311 for (LabelToScore::iterator p
= pending_updates_
.begin();
312 p
!= pending_updates_
.end();
315 Update(p
->first
, p
->second
);
319 pending_updates_
.clear();
322 void Print(int max
) {
323 VLOG(2) << "score " << TopScore() << " " << ToString(program_info_
)
325 if (!pending_updates_
.empty())
326 VLOG(2) << pending_updates_
.size() << " pending";
328 for (Queue::iterator q
= queue_
.begin(); q
!= queue_
.end(); ++q
) {
329 if (++count
> max
) break;
330 VLOG(2) << " " << q
->first
<< " " << ToString(q
->second
);
335 typedef std::map
<LabelInfo
*, int, OrderLabelInfo
> LabelToScore
;
336 typedef std::pair
<int, LabelInfo
*> ScoreAndLabel
;
337 struct OrderScoreAndLabelByScoreDecreasing
{
338 OrderLabelInfo tie_breaker
;
339 bool operator()(const ScoreAndLabel
& a
, const ScoreAndLabel
& b
) const {
340 if (a
.first
> b
.first
) return true;
341 if (a
.first
< b
.first
) return false;
342 return tie_breaker(a
.second
, b
.second
);
345 typedef std::set
<ScoreAndLabel
, OrderScoreAndLabelByScoreDecreasing
> Queue
;
347 LabelInfo
* program_info_
;
348 LabelToScore label_to_score_
;
349 LabelToScore pending_updates_
;
353 AssignmentCandidates
* LabelInfo::candidates() {
354 if (candidates_
== NULL
)
355 candidates_
= new AssignmentCandidates(this);
359 LabelInfo::~LabelInfo() {
363 // A Shingle is a short fixed-length string of LabelInfos that actually occurs
364 // in a Trace. A Shingle may occur many times. We repesent the Shingle by the
365 // position of one of the occurrences in the Trace.
368 static const size_t kWidth
= 5;
370 struct InterningLess
{
371 bool operator()(const Shingle
& a
, const Shingle
& b
) const;
374 typedef std::set
<Shingle
, InterningLess
> OwningSet
;
376 static Shingle
* Find(const Trace
& trace
, size_t position
,
377 OwningSet
* owning_set
) {
378 std::pair
<OwningSet::iterator
, bool> pair
=
379 owning_set
->insert(Shingle(trace
, position
));
380 // pair.first iterator 'points' to the newly inserted Shingle or the
381 // previouly inserted one that looks the same according to the comparator.
383 // const_cast required because key is const. We modify the Shingle
384 // extensively but not in a way that affects InterningLess.
385 Shingle
* shingle
= const_cast<Shingle
*>(&*pair
.first
);
386 shingle
->add_position(position
);
390 LabelInfo
* at(size_t i
) const { return trace_
[exemplar_position_
+ i
]; }
391 void add_position(size_t position
) {
392 positions_
.push_back(static_cast<uint32
>(position
));
394 int position_count() const { return static_cast<int>(positions_
.size()); }
396 bool InModel() const { return at(0)->is_model_
; }
398 ShinglePattern
* pattern() const { return pattern_
; }
399 void set_pattern(ShinglePattern
* pattern
) { pattern_
= pattern
; }
402 bool operator()(const Shingle
* a
, const Shingle
* b
) const {
403 // Arbitrary but repeatable (memory-address) independent ordering:
404 return a
->exemplar_position_
< b
->exemplar_position_
;
405 // return InterningLess()(*a, *b);
410 Shingle(const Trace
& trace
, size_t exemplar_position
)
412 exemplar_position_(exemplar_position
),
416 const Trace
& trace_
; // The shingle lives inside trace_.
417 size_t exemplar_position_
; // At this position (and other positions).
418 std::vector
<uint32
> positions_
; // Includes exemplar_position_.
420 ShinglePattern
* pattern_
; // Pattern changes as LabelInfos are assigned.
422 friend std::string
ToString(const Shingle
* instance
);
424 // We can't disallow the copy constructor because we use std::set<Shingle> and
425 // VS2005's implementation of std::set<T>::set() requires T to have a copy
427 // DISALLOW_COPY_AND_ASSIGN(Shingle);
428 void operator=(const Shingle
&); // Disallow assignment only.
431 std::string
ToString(const Shingle
* instance
) {
433 const char* sep
= "<";
434 for (size_t i
= 0; i
< Shingle::kWidth
; ++i
) {
435 // base::StringAppendF(&s, "%s%x ", sep, instance.at(i)->label_->rva_);
437 s
+= ToString(instance
->at(i
));
440 base::StringAppendF(&s
, ">(%" PRIuS
")@{%d}",
441 instance
->exemplar_position_
,
442 instance
->position_count());
447 bool Shingle::InterningLess::operator()(
449 const Shingle
& b
) const {
450 for (size_t i
= 0; i
< kWidth
; ++i
) {
451 LabelInfo
* info_a
= a
.at(i
);
452 LabelInfo
* info_b
= b
.at(i
);
453 if (info_a
->label_
->rva_
< info_b
->label_
->rva_
)
455 if (info_a
->label_
->rva_
> info_b
->label_
->rva_
)
457 if (info_a
->is_model_
< info_b
->is_model_
)
459 if (info_a
->is_model_
> info_b
->is_model_
)
461 if (info_a
!= info_b
) {
468 class ShinglePattern
{
470 enum { kOffsetMask
= 7, // Offset lives in low bits.
471 kFixed
= 0, // kind & kVariable == 0 => fixed.
472 kVariable
= 8 // kind & kVariable == 1 => variable.
474 // sequence[position + (kinds_[i] & kOffsetMask)] gives LabelInfo for position
475 // i of shingle. Below, second 'A' is duplicate of position 1, second '102'
476 // is duplicate of position 0.
478 // <102, A, 103, A , 102>
479 // --> <kFixed+0, kVariable+1, kFixed+2, kVariable+1, kFixed+0>
481 explicit Index(const Shingle
* instance
);
482 uint8 kinds_
[Shingle::kWidth
];
484 uint8 unique_variables_
;
485 uint8 first_variable_index_
;
487 int assigned_indexes_
[Shingle::kWidth
];
490 // ShinglePattern keeps histograms of member Shingle instances, ordered by
491 // decreasing number of occurrences. We don't have a pair (occurrence count,
492 // Shingle instance), so we use a FreqView adapter to make the instance
493 // pointer look like the pair.
496 explicit FreqView(const Shingle
* instance
) : instance_(instance
) {}
497 int count() const { return instance_
->position_count(); }
498 const Shingle
* instance() const { return instance_
; }
500 bool operator()(const FreqView
& a
, const FreqView
& b
) const {
501 if (a
.count() > b
.count()) return true;
502 if (a
.count() < b
.count()) return false;
503 return resolve_ties(a
.instance(), b
.instance());
506 Shingle::PointerLess resolve_ties
;
509 const Shingle
* instance_
;
512 typedef std::set
<FreqView
, FreqView::Greater
> Histogram
;
514 ShinglePattern() : index_(NULL
), model_coverage_(0), program_coverage_(0) {}
516 const Index
* index_
; // Points to the key in the owning map value_type.
517 Histogram model_histogram_
;
518 Histogram program_histogram_
;
520 int program_coverage_
;
523 std::string
ToString(const ShinglePattern::Index
* index
) {
528 base::StringAppendF(&s
, "<%d: ", index
->variables_
);
529 const char* sep
= "";
530 for (size_t i
= 0; i
< Shingle::kWidth
; ++i
) {
533 uint32 kind
= index
->kinds_
[i
];
534 int offset
= kind
& ShinglePattern::kOffsetMask
;
535 if (kind
& ShinglePattern::kVariable
)
536 base::StringAppendF(&s
, "V%d", offset
);
538 base::StringAppendF(&s
, "%d", index
->assigned_indexes_
[offset
]);
540 base::StringAppendF(&s
, " %x", index
->hash_
);
546 std::string
HistogramToString(const ShinglePattern::Histogram
& histogram
,
547 size_t snippet_max
) {
549 size_t histogram_size
= histogram
.size();
550 size_t snippet_size
= 0;
551 for (ShinglePattern::Histogram::const_iterator p
= histogram
.begin();
552 p
!= histogram
.end();
554 if (++snippet_size
> snippet_max
&& snippet_size
!= histogram_size
) {
558 base::StringAppendF(&s
, " %d", p
->count());
563 std::string
HistogramToStringFull(const ShinglePattern::Histogram
& histogram
,
565 size_t snippet_max
) {
568 size_t histogram_size
= histogram
.size();
569 size_t snippet_size
= 0;
570 for (ShinglePattern::Histogram::const_iterator p
= histogram
.begin();
571 p
!= histogram
.end();
574 if (++snippet_size
> snippet_max
&& snippet_size
!= histogram_size
) {
578 base::StringAppendF(&s
, "(%d) ", p
->count());
579 s
+= ToString(&(*p
->instance()));
585 std::string
ToString(const ShinglePattern
* pattern
, size_t snippet_max
= 3) {
587 if (pattern
== NULL
) {
591 s
+= ToString(pattern
->index_
);
592 base::StringAppendF(&s
, "; %d(%d):",
593 static_cast<int>(pattern
->model_histogram_
.size()),
594 pattern
->model_coverage_
);
596 s
+= HistogramToString(pattern
->model_histogram_
, snippet_max
);
597 base::StringAppendF(&s
, "; %d(%d):",
598 static_cast<int>(pattern
->program_histogram_
.size()),
599 pattern
->program_coverage_
);
600 s
+= HistogramToString(pattern
->program_histogram_
, snippet_max
);
606 std::string
ShinglePatternToStringFull(const ShinglePattern
* pattern
,
609 s
+= ToString(pattern
->index_
);
611 size_t model_size
= pattern
->model_histogram_
.size();
612 size_t program_size
= pattern
->program_histogram_
.size();
613 base::StringAppendF(&s
, " model shingles %" PRIuS
"\n", model_size
);
614 s
+= HistogramToStringFull(pattern
->model_histogram_
, " ", max
);
615 base::StringAppendF(&s
, " program shingles %" PRIuS
"\n", program_size
);
616 s
+= HistogramToStringFull(pattern
->program_histogram_
, " ", max
);
620 struct ShinglePatternIndexLess
{
621 bool operator()(const ShinglePattern::Index
& a
,
622 const ShinglePattern::Index
& b
) const {
623 if (a
.hash_
< b
.hash_
) return true;
624 if (a
.hash_
> b
.hash_
) return false;
626 for (size_t i
= 0; i
< Shingle::kWidth
; ++i
) {
627 if (a
.kinds_
[i
] < b
.kinds_
[i
]) return true;
628 if (a
.kinds_
[i
] > b
.kinds_
[i
]) return false;
629 if ((a
.kinds_
[i
] & ShinglePattern::kVariable
) == 0) {
630 if (a
.assigned_indexes_
[i
] < b
.assigned_indexes_
[i
])
632 if (a
.assigned_indexes_
[i
] > b
.assigned_indexes_
[i
])
640 static uint32
hash_combine(uint32 h
, uint32 v
) {
642 return (h
* (37 + 0x0000d100)) ^ (h
>> 13);
645 ShinglePattern::Index::Index(const Shingle
* instance
) {
648 unique_variables_
= 0;
649 first_variable_index_
= 255;
651 for (uint32 i
= 0; i
< Shingle::kWidth
; ++i
) {
652 LabelInfo
* info
= instance
->at(i
);
656 for ( ; j
< i
; ++j
) {
657 if (info
== instance
->at(j
)) { // Duplicate LabelInfo
662 if (j
== i
) { // Not found above.
663 if (info
->assignment_
) {
664 code
= info
->label_
->index_
;
665 assigned_indexes_
[i
] = code
;
668 kind
= kVariable
+ i
;
670 if (i
< first_variable_index_
)
671 first_variable_index_
= i
;
674 if (kind
& kVariable
) ++variables_
;
675 hash
= hash_combine(hash
, code
);
676 hash
= hash_combine(hash
, kind
);
678 assigned_indexes_
[i
] = code
;
683 struct ShinglePatternLess
{
684 bool operator()(const ShinglePattern
& a
, const ShinglePattern
& b
) const {
685 return index_less(*a
.index_
, *b
.index_
);
687 ShinglePatternIndexLess index_less
;
690 struct ShinglePatternPointerLess
{
691 bool operator()(const ShinglePattern
* a
, const ShinglePattern
* b
) const {
692 return pattern_less(*a
, *b
);
694 ShinglePatternLess pattern_less
;
697 template<int (*Scorer
)(const ShinglePattern
*)>
698 struct OrderShinglePatternByScoreDescending
{
699 bool operator()(const ShinglePattern
* a
, const ShinglePattern
* b
) const {
700 int score_a
= Scorer(a
);
701 int score_b
= Scorer(b
);
702 if (score_a
> score_b
) return true;
703 if (score_a
< score_b
) return false;
704 return break_ties(a
, b
);
706 ShinglePatternPointerLess break_ties
;
709 // Returns a score for a 'Single Use' rule. Returns -1 if the rule is not
711 int SingleUseScore(const ShinglePattern
* pattern
) {
712 if (pattern
->index_
->variables_
!= 1)
715 if (pattern
->model_histogram_
.size() != 1 ||
716 pattern
->program_histogram_
.size() != 1)
719 // Does this pattern account for all uses of the variable?
720 const ShinglePattern::FreqView
& program_freq
=
721 *pattern
->program_histogram_
.begin();
722 const ShinglePattern::FreqView
& model_freq
=
723 *pattern
->model_histogram_
.begin();
724 int p1
= program_freq
.count();
725 int m1
= model_freq
.count();
727 const Shingle
* program_instance
= program_freq
.instance();
728 const Shingle
* model_instance
= model_freq
.instance();
729 size_t variable_index
= pattern
->index_
->first_variable_index_
;
730 LabelInfo
* program_info
= program_instance
->at(variable_index
);
731 LabelInfo
* model_info
= model_instance
->at(variable_index
);
732 if (!program_info
->assignment_
) {
733 if (program_info
->refs_
== p1
&& model_info
->refs_
== m1
) {
741 // The VariableQueue is a priority queue of unassigned LabelInfos from
742 // the 'program' (the 'variables') and their AssignmentCandidates.
743 class VariableQueue
{
745 typedef std::pair
<int, LabelInfo
*> ScoreAndLabel
;
749 bool empty() const { return queue_
.empty(); }
751 const ScoreAndLabel
& first() const { return *queue_
.begin(); }
753 // For debugging only.
755 for (Queue::const_iterator p
= queue_
.begin(); p
!= queue_
.end(); ++p
) {
756 AssignmentCandidates
* candidates
= p
->second
->candidates();
757 candidates
->Print(std::numeric_limits
<int>::max());
761 void AddPendingUpdate(LabelInfo
* program_info
, LabelInfo
* model_info
,
763 AssignmentCandidates
* candidates
= program_info
->candidates();
764 if (!candidates
->HasPendingUpdates()) {
765 pending_update_candidates_
.push_back(candidates
);
767 candidates
->AddPendingUpdate(model_info
, delta_score
);
770 void ApplyPendingUpdates() {
771 for (size_t i
= 0; i
< pending_update_candidates_
.size(); ++i
) {
772 AssignmentCandidates
* candidates
= pending_update_candidates_
[i
];
773 int old_score
= candidates
->TopScore();
774 queue_
.erase(ScoreAndLabel(old_score
, candidates
->program_info()));
775 candidates
->ApplyPendingUpdates();
776 if (!candidates
->empty()) {
777 int new_score
= candidates
->TopScore();
778 queue_
.insert(ScoreAndLabel(new_score
, candidates
->program_info()));
781 pending_update_candidates_
.clear();
785 struct OrderScoreAndLabelByScoreDecreasing
{
786 bool operator()(const ScoreAndLabel
& a
, const ScoreAndLabel
& b
) const {
787 if (a
.first
> b
.first
) return true;
788 if (a
.first
< b
.first
) return false;
789 return OrderLabelInfo()(a
.second
, b
.second
);
792 typedef std::set
<ScoreAndLabel
, OrderScoreAndLabelByScoreDecreasing
> Queue
;
795 std::vector
<AssignmentCandidates
*> pending_update_candidates_
;
797 DISALLOW_COPY_AND_ASSIGN(VariableQueue
);
801 class AssignmentProblem
{
803 AssignmentProblem(const Trace
& trace
, size_t model_end
)
805 model_end_(model_end
) {
806 VLOG(2) << "AssignmentProblem::AssignmentProblem " << model_end
<< ", "
811 if (model_end_
< Shingle::kWidth
||
812 trace_
.size() - model_end_
< Shingle::kWidth
) {
813 // Nothing much we can do with such a short problem.
816 instances_
.resize(trace_
.size() - Shingle::kWidth
+ 1, NULL
);
817 AddShingles(0, model_end_
);
818 AddShingles(model_end_
, trace_
.size());
820 AddPatternsNeedingUpdatesToQueues();
822 patterns_needing_updates_
.clear();
823 while (FindAndAssignBestLeader())
824 patterns_needing_updates_
.clear();
825 PrintActivePatterns();
831 typedef std::set
<Shingle
*, Shingle::PointerLess
> ShingleSet
;
833 typedef std::set
<const ShinglePattern
*, ShinglePatternPointerLess
>
836 // Patterns are partitioned into the following sets:
838 // * Retired patterns (not stored). No shingles exist for this pattern (they
839 // all now match more specialized patterns).
840 // * Useless patterns (not stored). There are no 'program' shingles for this
841 // pattern (they all now match more specialized patterns).
842 // * Single-use patterns - single_use_pattern_queue_.
843 // * Other patterns - active_non_single_use_patterns_ / variable_queue_.
845 typedef std::set
<const ShinglePattern
*,
846 OrderShinglePatternByScoreDescending
<&SingleUseScore
> >
847 SingleUsePatternQueue
;
849 void PrintPatternsHeader() const {
850 VLOG(2) << shingle_instances_
.size() << " instances "
851 << trace_
.size() << " trace length "
852 << patterns_
.size() << " shingle indexes "
853 << single_use_pattern_queue_
.size() << " single use patterns "
854 << active_non_single_use_patterns_
.size() << " active patterns";
857 void PrintActivePatterns() const {
858 for (ShinglePatternSet::const_iterator p
=
859 active_non_single_use_patterns_
.begin();
860 p
!= active_non_single_use_patterns_
.end();
862 const ShinglePattern
* pattern
= *p
;
863 VLOG(2) << ToString(pattern
, 10);
867 void PrintPatterns() const {
869 PrintActivePatterns();
873 void PrintAllPatterns() const {
874 for (IndexToPattern::const_iterator p
= patterns_
.begin();
875 p
!= patterns_
.end();
877 const ShinglePattern
& pattern
= p
->second
;
878 VLOG(2) << ToString(&pattern
, 10);
882 void PrintAllShingles() const {
883 for (Shingle::OwningSet::const_iterator p
= shingle_instances_
.begin();
884 p
!= shingle_instances_
.end();
886 const Shingle
& instance
= *p
;
887 VLOG(2) << ToString(&instance
) << " " << ToString(instance
.pattern());
892 void AddShingles(size_t begin
, size_t end
) {
893 for (size_t i
= begin
; i
+ Shingle::kWidth
- 1 < end
; ++i
) {
894 instances_
[i
] = Shingle::Find(trace_
, i
, &shingle_instances_
);
898 void Declassify(Shingle
* shingle
) {
899 ShinglePattern
* pattern
= shingle
->pattern();
900 if (shingle
->InModel()) {
901 pattern
->model_histogram_
.erase(ShinglePattern::FreqView(shingle
));
902 pattern
->model_coverage_
-= shingle
->position_count();
904 pattern
->program_histogram_
.erase(ShinglePattern::FreqView(shingle
));
905 pattern
->program_coverage_
-= shingle
->position_count();
907 shingle
->set_pattern(NULL
);
910 void Reclassify(Shingle
* shingle
) {
911 ShinglePattern
* pattern
= shingle
->pattern();
912 LOG_ASSERT(pattern
== NULL
);
914 ShinglePattern::Index
index(shingle
);
915 if (index
.variables_
== 0)
918 std::pair
<IndexToPattern::iterator
, bool> inserted
=
919 patterns_
.insert(std::make_pair(index
, ShinglePattern()));
921 pattern
= &inserted
.first
->second
;
922 pattern
->index_
= &inserted
.first
->first
;
923 shingle
->set_pattern(pattern
);
924 patterns_needing_updates_
.insert(pattern
);
926 if (shingle
->InModel()) {
927 pattern
->model_histogram_
.insert(ShinglePattern::FreqView(shingle
));
928 pattern
->model_coverage_
+= shingle
->position_count();
930 pattern
->program_histogram_
.insert(ShinglePattern::FreqView(shingle
));
931 pattern
->program_coverage_
+= shingle
->position_count();
935 void InitialClassify() {
936 for (Shingle::OwningSet::iterator p
= shingle_instances_
.begin();
937 p
!= shingle_instances_
.end();
939 // GCC's set<T>::iterator::operator *() returns a const object.
940 Reclassify(const_cast<Shingle
*>(&*p
));
944 // For the positions in |info|, find the shingles that overlap that position.
945 void AddAffectedPositions(LabelInfo
* info
, ShingleSet
* affected_shingles
) {
946 const size_t kWidth
= Shingle::kWidth
;
947 for (size_t i
= 0; i
< info
->positions_
.size(); ++i
) {
948 size_t position
= info
->positions_
[i
];
949 // Find bounds to the subrange of |trace_| we are in.
950 size_t start
= position
< model_end_
? 0 : model_end_
;
951 size_t end
= position
< model_end_
? model_end_
: trace_
.size();
953 // Clip [position-kWidth+1, position+1)
954 size_t low
= position
> start
+ kWidth
- 1
955 ? position
- kWidth
+ 1
957 size_t high
= position
+ kWidth
< end
? position
+ 1 : end
- kWidth
+ 1;
959 for (size_t shingle_position
= low
;
960 shingle_position
< high
;
961 ++shingle_position
) {
962 Shingle
* overlapping_shingle
= instances_
.at(shingle_position
);
963 affected_shingles
->insert(overlapping_shingle
);
968 void RemovePatternsNeedingUpdatesFromQueues() {
969 for (ShinglePatternSet::iterator p
= patterns_needing_updates_
.begin();
970 p
!= patterns_needing_updates_
.end();
972 RemovePatternFromQueues(*p
);
976 void AddPatternsNeedingUpdatesToQueues() {
977 for (ShinglePatternSet::iterator p
= patterns_needing_updates_
.begin();
978 p
!= patterns_needing_updates_
.end();
980 AddPatternToQueues(*p
);
982 variable_queue_
.ApplyPendingUpdates();
985 void RemovePatternFromQueues(const ShinglePattern
* pattern
) {
986 int single_use_score
= SingleUseScore(pattern
);
987 if (single_use_score
> 0) {
988 size_t n
= single_use_pattern_queue_
.erase(pattern
);
990 } else if (pattern
->program_histogram_
.empty() &&
991 pattern
->model_histogram_
.empty()) {
992 NOTREACHED(); // Should not come back to life.
993 } else if (pattern
->program_histogram_
.empty()) {
996 active_non_single_use_patterns_
.erase(pattern
);
997 AddPatternToLabelQueue(pattern
, -1);
1001 void AddPatternToQueues(const ShinglePattern
* pattern
) {
1002 int single_use_score
= SingleUseScore(pattern
);
1003 if (single_use_score
> 0) {
1004 single_use_pattern_queue_
.insert(pattern
);
1005 } else if (pattern
->program_histogram_
.empty() &&
1006 pattern
->model_histogram_
.empty()) {
1007 } else if (pattern
->program_histogram_
.empty()) {
1010 active_non_single_use_patterns_
.insert(pattern
);
1011 AddPatternToLabelQueue(pattern
, +1);
1015 void AddPatternToLabelQueue(const ShinglePattern
* pattern
, int sign
) {
1016 // For each possible assignment in this pattern, update the potential
1017 // contributions to the LabelInfo queues.
1019 // We want to find for each symbol (LabelInfo) the maximum contribution that
1020 // could be achieved by making shingle-wise assignments between shingles in
1021 // the model and shingles in the program.
1023 // If the shingles in the histograms are independent (no two shingles have a
1024 // symbol in common) then any permutation of the assignments is possible,
1025 // and the maximum contribution can be found by taking the maximum over all
1028 // If the shingles are dependent two things happen. The maximum
1029 // contribution to any given symbol is a sum because the symbol has
1030 // contributions from all the shingles containing it. Second, some
1031 // assignments are blocked by previous incompatible assignments. We want to
1032 // avoid a combinatorial search, so we ignore the blocking.
1034 const size_t kUnwieldy
= 5;
1036 typedef std::map
<LabelInfo
*, int> LabelToScore
;
1037 typedef std::map
<LabelInfo
*, LabelToScore
> ScoreSet
;
1040 size_t n_model_samples
= 0;
1041 for (ShinglePattern::Histogram::const_iterator model_iter
=
1042 pattern
->model_histogram_
.begin();
1043 model_iter
!= pattern
->model_histogram_
.end();
1045 if (++n_model_samples
> kUnwieldy
) break;
1046 const ShinglePattern::FreqView
& model_freq
= *model_iter
;
1047 int m1
= model_freq
.count();
1048 const Shingle
* model_instance
= model_freq
.instance();
1051 size_t n_program_samples
= 0;
1052 for (ShinglePattern::Histogram::const_iterator program_iter
=
1053 pattern
->program_histogram_
.begin();
1054 program_iter
!= pattern
->program_histogram_
.end();
1056 if (++n_program_samples
> kUnwieldy
) break;
1057 const ShinglePattern::FreqView
& program_freq
= *program_iter
;
1058 int p1
= program_freq
.count();
1059 const Shingle
* program_instance
= program_freq
.instance();
1061 // int score = p1; // ? weigh all equally??
1062 int score
= std::min(p1
, m1
);
1064 for (size_t i
= 0; i
< Shingle::kWidth
; ++i
) {
1065 LabelInfo
* program_info
= program_instance
->at(i
);
1066 LabelInfo
* model_info
= model_instance
->at(i
);
1067 if ((model_info
->assignment_
== NULL
) !=
1068 (program_info
->assignment_
== NULL
)) {
1069 VLOG(2) << "ERROR " << i
1070 << "\n\t" << ToString(pattern
, 10)
1071 << "\n\t" << ToString(program_instance
)
1072 << "\n\t" << ToString(model_instance
);
1074 if (!program_info
->assignment_
&& !model_info
->assignment_
) {
1075 sums
[program_info
][model_info
] += score
;
1079 for (ScoreSet::iterator assignee_iterator
= sums
.begin();
1080 assignee_iterator
!= sums
.end();
1081 ++assignee_iterator
) {
1082 LabelInfo
* program_info
= assignee_iterator
->first
;
1083 for (LabelToScore::iterator p
= assignee_iterator
->second
.begin();
1084 p
!= assignee_iterator
->second
.end();
1086 LabelInfo
* model_info
= p
->first
;
1087 int score
= p
->second
;
1088 int* slot
= &maxima
[program_info
][model_info
];
1089 *slot
= std::max(*slot
, score
);
1095 for (ScoreSet::iterator assignee_iterator
= maxima
.begin();
1096 assignee_iterator
!= maxima
.end();
1097 ++assignee_iterator
) {
1098 LabelInfo
* program_info
= assignee_iterator
->first
;
1099 for (LabelToScore::iterator p
= assignee_iterator
->second
.begin();
1100 p
!= assignee_iterator
->second
.end();
1102 LabelInfo
* model_info
= p
->first
;
1103 int score
= sign
* p
->second
;
1104 variable_queue_
.AddPendingUpdate(program_info
, model_info
, score
);
1109 void AssignOne(LabelInfo
* model_info
, LabelInfo
* program_info
) {
1110 LOG_ASSERT(!model_info
->assignment_
);
1111 LOG_ASSERT(!program_info
->assignment_
);
1112 LOG_ASSERT(model_info
->is_model_
);
1113 LOG_ASSERT(!program_info
->is_model_
);
1115 VLOG(3) << "Assign " << ToString(program_info
)
1116 << " := " << ToString(model_info
);
1118 ShingleSet affected_shingles
;
1119 AddAffectedPositions(model_info
, &affected_shingles
);
1120 AddAffectedPositions(program_info
, &affected_shingles
);
1122 for (ShingleSet::iterator p
= affected_shingles
.begin();
1123 p
!= affected_shingles
.end();
1125 patterns_needing_updates_
.insert((*p
)->pattern());
1128 RemovePatternsNeedingUpdatesFromQueues();
1130 for (ShingleSet::iterator p
= affected_shingles
.begin();
1131 p
!= affected_shingles
.end();
1136 program_info
->label_
->index_
= model_info
->label_
->index_
;
1138 model_info
->assignment_
= program_info
;
1139 program_info
->assignment_
= model_info
;
1141 for (ShingleSet::iterator p
= affected_shingles
.begin();
1142 p
!= affected_shingles
.end();
1147 AddPatternsNeedingUpdatesToQueues();
1150 bool AssignFirstVariableOfHistogramHead(const ShinglePattern
& pattern
) {
1151 const ShinglePattern::FreqView
& program_1
=
1152 *pattern
.program_histogram_
.begin();
1153 const ShinglePattern::FreqView
& model_1
= *pattern
.model_histogram_
.begin();
1154 const Shingle
* program_instance
= program_1
.instance();
1155 const Shingle
* model_instance
= model_1
.instance();
1156 size_t variable_index
= pattern
.index_
->first_variable_index_
;
1157 LabelInfo
* program_info
= program_instance
->at(variable_index
);
1158 LabelInfo
* model_info
= model_instance
->at(variable_index
);
1159 AssignOne(model_info
, program_info
);
1163 bool FindAndAssignBestLeader() {
1164 LOG_ASSERT(patterns_needing_updates_
.empty());
1166 if (!single_use_pattern_queue_
.empty()) {
1167 const ShinglePattern
& pattern
= **single_use_pattern_queue_
.begin();
1168 return AssignFirstVariableOfHistogramHead(pattern
);
1171 if (variable_queue_
.empty())
1174 const VariableQueue::ScoreAndLabel best
= variable_queue_
.first();
1175 int score
= best
.first
;
1176 LabelInfo
* assignee
= best
.second
;
1178 // TODO(sra): score (best.first) can be zero. A zero score means we are
1179 // blindly picking between two (or more) alternatives which look the same.
1180 // If we exit on the first zero-score we sometimes get 3-4% better total
1181 // compression. This indicates that 'infill' is doing a better job than
1182 // picking blindly. Perhaps we can use an extended region around the
1183 // undistinguished competing alternatives to break the tie.
1185 variable_queue_
.Print();
1189 AssignmentCandidates
* candidates
= assignee
->candidates();
1190 if (candidates
->empty())
1191 return false; // Should not happen.
1193 AssignOne(candidates
->top_candidate(), assignee
);
1198 // The trace vector contains the model sequence [0, model_end_) followed by
1199 // the program sequence [model_end_, trace.end())
1200 const Trace
& trace_
;
1203 // |shingle_instances_| is the set of 'interned' shingles.
1204 Shingle::OwningSet shingle_instances_
;
1206 // |instances_| maps from position in |trace_| to Shingle at that position.
1207 std::vector
<Shingle
*> instances_
;
1209 SingleUsePatternQueue single_use_pattern_queue_
;
1210 ShinglePatternSet active_non_single_use_patterns_
;
1211 VariableQueue variable_queue_
;
1213 // Transient information: when we make an assignment, we need to recompute
1214 // priority queue information derived from these ShinglePatterns.
1215 ShinglePatternSet patterns_needing_updates_
;
1217 typedef std::map
<ShinglePattern::Index
,
1218 ShinglePattern
, ShinglePatternIndexLess
> IndexToPattern
;
1219 IndexToPattern patterns_
;
1221 DISALLOW_COPY_AND_ASSIGN(AssignmentProblem
);
1224 class Adjuster
: public AdjustmentMethod
{
1226 Adjuster() : prog_(NULL
), model_(NULL
) {}
1229 bool Adjust(const AssemblyProgram
& model
, AssemblyProgram
* program
) {
1230 VLOG(1) << "Adjuster::Adjust";
1237 prog_
->UnassignIndexes();
1240 CollectTraces(model_
, &abs32_trace_
, &rel32_trace_
, true);
1241 size_t abs32_model_end
= abs32_trace_
.size();
1242 size_t rel32_model_end
= rel32_trace_
.size();
1243 CollectTraces(prog_
, &abs32_trace_
, &rel32_trace_
, false);
1244 Solve(abs32_trace_
, abs32_model_end
);
1245 Solve(rel32_trace_
, rel32_model_end
);
1246 prog_
->AssignRemainingIndexes();
1251 void CollectTraces(const AssemblyProgram
* program
, Trace
* abs32
, Trace
* rel32
,
1253 label_info_maker_
.ResetDebugLabel();
1254 const InstructionVector
& instructions
= program
->instructions();
1255 for (size_t i
= 0; i
< instructions
.size(); ++i
) {
1256 Instruction
* instruction
= instructions
[i
];
1257 if (Label
* label
= program
->InstructionAbs32Label(instruction
))
1258 ReferenceLabel(abs32
, label
, is_model
);
1259 if (Label
* label
= program
->InstructionRel32Label(instruction
))
1260 ReferenceLabel(rel32
, label
, is_model
);
1262 // TODO(sra): we could simply append all the labels in index order to
1263 // incorporate some costing for entropy (bigger deltas) that will be
1264 // introduced into the label address table by non-monotonic ordering. This
1265 // would have some knock-on effects to parts of the algorithm that work on
1266 // single-occurrence labels.
1269 void Solve(const Trace
& model
, size_t model_end
) {
1270 base::Time start_time
= base::Time::Now();
1271 AssignmentProblem
a(model
, model_end
);
1273 VLOG(1) << " Adjuster::Solve "
1274 << (base::Time::Now() - start_time
).InSecondsF();
1277 void ReferenceLabel(Trace
* trace
, Label
* label
, bool is_model
) {
1279 label_info_maker_
.MakeLabelInfo(label
, is_model
,
1280 static_cast<uint32
>(trace
->size())));
1283 AssemblyProgram
* prog_
; // Program to be adjusted, owned by caller.
1284 const AssemblyProgram
* model_
; // Program to be mimicked, owned by caller.
1286 LabelInfoMaker label_info_maker_
;
1289 DISALLOW_COPY_AND_ASSIGN(Adjuster
);
1292 ////////////////////////////////////////////////////////////////////////////////
1294 } // namespace adjustment_method_2
1296 AdjustmentMethod
* AdjustmentMethod::MakeShingleAdjustmentMethod() {
1297 return new adjustment_method_2::Adjuster();
1300 } // namespace courgette