2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003-2016 Davide Libenzi, Johannes E. Schindelin
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
19 * Davide Libenzi <davidel@xmailserver.org>
25 * The basic idea of patience diff is to find lines that are unique in
26 * both files. These are intuitively the ones that we want to see as
29 * The maximal ordered sequence of such line pairs (where ordered means
30 * that the order in the sequence agrees with the order of the lines in
31 * both files) naturally defines an initial set of common lines.
33 * Now, the algorithm tries to extend the set of common lines by growing
34 * the line ranges where the files have identical lines.
36 * Between those common lines, the patience diff algorithm is applied
37 * recursively, until no unique line pairs can be found; these line ranges
38 * are handled by the well-known Myers algorithm.
41 #define NON_UNIQUE ULONG_MAX
44 * This is a hash mapping from line hash to line numbers in the first and
52 * 0 = unused entry, 1 = first line, 2 = second, etc.
53 * line2 is NON_UNIQUE if the line is not unique
54 * in either the first or the second file.
56 unsigned long line1
, line2
;
58 * "next" & "previous" are used for the longest common
60 * initially, "next" reflects only the order in file1.
62 struct entry
*next
, *previous
;
65 * If 1, this entry can serve as an anchor. See
66 * Documentation/diff-options.txt for more information.
69 } *entries
, *first
, *last
;
70 /* were common records found? */
71 unsigned long has_matches
;
76 static int is_anchor(xpparam_t
const *xpp
, const char *line
)
79 for (i
= 0; i
< xpp
->anchors_nr
; i
++) {
80 if (!strncmp(line
, xpp
->anchors
[i
], strlen(xpp
->anchors
[i
])))
86 /* The argument "pass" is 1 for the first file, 2 for the second. */
87 static void insert_record(xpparam_t
const *xpp
, int line
, struct hashmap
*map
,
90 xrecord_t
**records
= pass
== 1 ?
91 map
->env
->xdf1
.recs
: map
->env
->xdf2
.recs
;
92 xrecord_t
*record
= records
[line
- 1];
94 * After xdl_prepare_env() (or more precisely, due to
95 * xdl_classify_record()), the "ha" member of the records (AKA lines)
96 * is _not_ the hash anymore, but a linearized version of it. In
97 * other words, the "ha" member is guaranteed to start with 0 and
98 * the second record's ha can only be 0 or 1, etc.
100 * So we multiply ha by 2 in the hope that the hashing was
103 int index
= (int)((record
->ha
<< 1) % map
->alloc
);
105 while (map
->entries
[index
].line1
) {
106 if (map
->entries
[index
].hash
!= record
->ha
) {
107 if (++index
>= map
->alloc
)
112 map
->has_matches
= 1;
113 if (pass
== 1 || map
->entries
[index
].line2
)
114 map
->entries
[index
].line2
= NON_UNIQUE
;
116 map
->entries
[index
].line2
= line
;
121 map
->entries
[index
].line1
= line
;
122 map
->entries
[index
].hash
= record
->ha
;
123 map
->entries
[index
].anchor
= is_anchor(xpp
, map
->env
->xdf1
.recs
[line
- 1]->ptr
);
125 map
->first
= map
->entries
+ index
;
127 map
->last
->next
= map
->entries
+ index
;
128 map
->entries
[index
].previous
= map
->last
;
130 map
->last
= map
->entries
+ index
;
135 * This function has to be called for each recursion into the inter-hunk
136 * parts, as previously non-unique lines can become unique when being
137 * restricted to a smaller part of the files.
139 * It is assumed that env has been prepared using xdl_prepare().
141 static int fill_hashmap(xpparam_t
const *xpp
, xdfenv_t
*env
,
142 struct hashmap
*result
,
143 int line1
, int count1
, int line2
, int count2
)
148 /* We know exactly how large we want the hash map */
149 result
->alloc
= count1
* 2;
150 if (!XDL_CALLOC_ARRAY(result
->entries
, result
->alloc
))
153 /* First, fill with entries from the first file */
155 insert_record(xpp
, line1
++, result
, 1);
157 /* Then search for matches in the second file */
159 insert_record(xpp
, line2
++, result
, 2);
165 * Find the longest sequence with a smaller last element (meaning a smaller
166 * line2, as we construct the sequence with entries ordered by line1).
168 static int binary_search(struct entry
**sequence
, int longest
,
171 int left
= -1, right
= longest
;
173 while (left
+ 1 < right
) {
174 int middle
= left
+ (right
- left
) / 2;
175 /* by construction, no two entries can be equal */
176 if (sequence
[middle
]->line2
> entry
->line2
)
181 /* return the index in "sequence", _not_ the sequence length */
186 * The idea is to start with the list of common unique lines sorted by
187 * the order in file1. For each of these pairs, the longest (partial)
188 * sequence whose last element's line2 is smaller is determined.
190 * For efficiency, the sequences are kept in a list containing exactly one
191 * item per sequence length: the sequence with the smallest last
192 * element (in terms of line2).
194 static int find_longest_common_sequence(struct hashmap
*map
, struct entry
**res
)
196 struct entry
**sequence
;
201 * If not -1, this entry in sequence must never be overridden.
202 * Therefore, overriding entries before this has no effect, so
203 * do not do that either.
207 if (!XDL_ALLOC_ARRAY(sequence
, map
->nr
))
210 for (entry
= map
->first
; entry
; entry
= entry
->next
) {
211 if (!entry
->line2
|| entry
->line2
== NON_UNIQUE
)
213 i
= binary_search(sequence
, longest
, entry
);
214 entry
->previous
= i
< 0 ? NULL
: sequence
[i
];
221 longest
= anchor_i
+ 1;
222 } else if (i
== longest
) {
227 /* No common unique lines were found */
234 /* Iterate starting at the last element, adjusting the "next" members */
235 entry
= sequence
[longest
- 1];
237 while (entry
->previous
) {
238 entry
->previous
->next
= entry
;
239 entry
= entry
->previous
;
246 static int match(struct hashmap
*map
, int line1
, int line2
)
248 xrecord_t
*record1
= map
->env
->xdf1
.recs
[line1
- 1];
249 xrecord_t
*record2
= map
->env
->xdf2
.recs
[line2
- 1];
250 return record1
->ha
== record2
->ha
;
253 static int patience_diff(xpparam_t
const *xpp
, xdfenv_t
*env
,
254 int line1
, int count1
, int line2
, int count2
);
256 static int walk_common_sequence(struct hashmap
*map
, struct entry
*first
,
257 int line1
, int count1
, int line2
, int count2
)
259 int end1
= line1
+ count1
, end2
= line2
+ count2
;
263 /* Try to grow the line ranges of common lines */
265 next1
= first
->line1
;
266 next2
= first
->line2
;
267 while (next1
> line1
&& next2
> line2
&&
268 match(map
, next1
- 1, next2
- 1)) {
276 while (line1
< next1
&& line2
< next2
&&
277 match(map
, line1
, line2
)) {
283 if (next1
> line1
|| next2
> line2
) {
284 if (patience_diff(map
->xpp
, map
->env
,
285 line1
, next1
- line1
,
286 line2
, next2
- line2
))
293 while (first
->next
&&
294 first
->next
->line1
== first
->line1
+ 1 &&
295 first
->next
->line2
== first
->line2
+ 1)
298 line1
= first
->line1
+ 1;
299 line2
= first
->line2
+ 1;
305 static int fall_back_to_classic_diff(struct hashmap
*map
,
306 int line1
, int count1
, int line2
, int count2
)
310 memset(&xpp
, 0, sizeof(xpp
));
311 xpp
.flags
= map
->xpp
->flags
& ~XDF_DIFF_ALGORITHM_MASK
;
313 return xdl_fall_back_diff(map
->env
, &xpp
,
314 line1
, count1
, line2
, count2
);
318 * Recursively find the longest common sequence of unique lines,
319 * and if none was found, ask xdl_do_diff() to do the job.
321 * This function assumes that env was prepared with xdl_prepare_env().
323 static int patience_diff(xpparam_t
const *xpp
, xdfenv_t
*env
,
324 int line1
, int count1
, int line2
, int count2
)
330 /* trivial case: one side is empty */
333 env
->xdf2
.rchg
[line2
++ - 1] = 1;
335 } else if (!count2
) {
337 env
->xdf1
.rchg
[line1
++ - 1] = 1;
341 memset(&map
, 0, sizeof(map
));
342 if (fill_hashmap(xpp
, env
, &map
,
343 line1
, count1
, line2
, count2
))
346 /* are there any matching lines at all? */
347 if (!map
.has_matches
) {
349 env
->xdf1
.rchg
[line1
++ - 1] = 1;
351 env
->xdf2
.rchg
[line2
++ - 1] = 1;
352 xdl_free(map
.entries
);
356 result
= find_longest_common_sequence(&map
, &first
);
360 result
= walk_common_sequence(&map
, first
,
361 line1
, count1
, line2
, count2
);
363 result
= fall_back_to_classic_diff(&map
,
364 line1
, count1
, line2
, count2
);
366 xdl_free(map
.entries
);
370 int xdl_do_patience_diff(xpparam_t
const *xpp
, xdfenv_t
*env
)
372 return patience_diff(xpp
, env
, 1, env
->xdf1
.nrec
, 1, env
->xdf2
.nrec
);