2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003-2016 Davide Libenzi, Johannes E. Schindelin
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
19 * Davide Libenzi <davidel@xmailserver.org>
25 * The basic idea of patience diff is to find lines that are unique in
26 * both files. These are intuitively the ones that we want to see as
29 * The maximal ordered sequence of such line pairs (where ordered means
30 * that the order in the sequence agrees with the order of the lines in
31 * both files) naturally defines an initial set of common lines.
33 * Now, the algorithm tries to extend the set of common lines by growing
34 * the line ranges where the files have identical lines.
36 * Between those common lines, the patience diff algorithm is applied
37 * recursively, until no unique line pairs can be found; these line ranges
38 * are handled by the well-known Myers algorithm.
41 #define NON_UNIQUE ULONG_MAX
44 * This is a hash mapping from line hash to line numbers in the first and
52 * 0 = unused entry, 1 = first line, 2 = second, etc.
53 * line2 is NON_UNIQUE if the line is not unique
54 * in either the first or the second file.
56 unsigned long line1
, line2
;
58 * "next" & "previous" are used for the longest common
60 * initially, "next" reflects only the order in file1.
62 struct entry
*next
, *previous
;
65 * If 1, this entry can serve as an anchor. See
66 * Documentation/diff-options.txt for more information.
69 } *entries
, *first
, *last
;
70 /* were common records found? */
71 unsigned long has_matches
;
72 mmfile_t
*file1
, *file2
;
77 static int is_anchor(xpparam_t
const *xpp
, const char *line
)
80 for (i
= 0; i
< xpp
->anchors_nr
; i
++) {
81 if (!strncmp(line
, xpp
->anchors
[i
], strlen(xpp
->anchors
[i
])))
87 /* The argument "pass" is 1 for the first file, 2 for the second. */
88 static void insert_record(xpparam_t
const *xpp
, int line
, struct hashmap
*map
,
91 xrecord_t
**records
= pass
== 1 ?
92 map
->env
->xdf1
.recs
: map
->env
->xdf2
.recs
;
93 xrecord_t
*record
= records
[line
- 1], *other
;
95 * After xdl_prepare_env() (or more precisely, due to
96 * xdl_classify_record()), the "ha" member of the records (AKA lines)
97 * is _not_ the hash anymore, but a linearized version of it. In
98 * other words, the "ha" member is guaranteed to start with 0 and
99 * the second record's ha can only be 0 or 1, etc.
101 * So we multiply ha by 2 in the hope that the hashing was
104 int index
= (int)((record
->ha
<< 1) % map
->alloc
);
106 while (map
->entries
[index
].line1
) {
107 other
= map
->env
->xdf1
.recs
[map
->entries
[index
].line1
- 1];
108 if (map
->entries
[index
].hash
!= record
->ha
||
109 !xdl_recmatch(record
->ptr
, record
->size
,
110 other
->ptr
, other
->size
,
112 if (++index
>= map
->alloc
)
117 map
->has_matches
= 1;
118 if (pass
== 1 || map
->entries
[index
].line2
)
119 map
->entries
[index
].line2
= NON_UNIQUE
;
121 map
->entries
[index
].line2
= line
;
126 map
->entries
[index
].line1
= line
;
127 map
->entries
[index
].hash
= record
->ha
;
128 map
->entries
[index
].anchor
= is_anchor(xpp
, map
->env
->xdf1
.recs
[line
- 1]->ptr
);
130 map
->first
= map
->entries
+ index
;
132 map
->last
->next
= map
->entries
+ index
;
133 map
->entries
[index
].previous
= map
->last
;
135 map
->last
= map
->entries
+ index
;
140 * This function has to be called for each recursion into the inter-hunk
141 * parts, as previously non-unique lines can become unique when being
142 * restricted to a smaller part of the files.
144 * It is assumed that env has been prepared using xdl_prepare().
146 static int fill_hashmap(mmfile_t
*file1
, mmfile_t
*file2
,
147 xpparam_t
const *xpp
, xdfenv_t
*env
,
148 struct hashmap
*result
,
149 int line1
, int count1
, int line2
, int count2
)
151 result
->file1
= file1
;
152 result
->file2
= file2
;
156 /* We know exactly how large we want the hash map */
157 result
->alloc
= count1
* 2;
158 result
->entries
= (struct entry
*)
159 xdl_malloc(result
->alloc
* sizeof(struct entry
));
160 if (!result
->entries
)
162 memset(result
->entries
, 0, result
->alloc
* sizeof(struct entry
));
164 /* First, fill with entries from the first file */
166 insert_record(xpp
, line1
++, result
, 1);
168 /* Then search for matches in the second file */
170 insert_record(xpp
, line2
++, result
, 2);
176 * Find the longest sequence with a smaller last element (meaning a smaller
177 * line2, as we construct the sequence with entries ordered by line1).
179 static int binary_search(struct entry
**sequence
, int longest
,
182 int left
= -1, right
= longest
;
184 while (left
+ 1 < right
) {
185 int middle
= left
+ (right
- left
) / 2;
186 /* by construction, no two entries can be equal */
187 if (sequence
[middle
]->line2
> entry
->line2
)
192 /* return the index in "sequence", _not_ the sequence length */
197 * The idea is to start with the list of common unique lines sorted by
198 * the order in file1. For each of these pairs, the longest (partial)
199 * sequence whose last element's line2 is smaller is determined.
201 * For efficiency, the sequences are kept in a list containing exactly one
202 * item per sequence length: the sequence with the smallest last
203 * element (in terms of line2).
205 static struct entry
*find_longest_common_sequence(struct hashmap
*map
)
207 struct entry
**sequence
= xdl_malloc(map
->nr
* sizeof(struct entry
*));
212 * If not -1, this entry in sequence must never be overridden.
213 * Therefore, overriding entries before this has no effect, so
214 * do not do that either.
218 for (entry
= map
->first
; entry
; entry
= entry
->next
) {
219 if (!entry
->line2
|| entry
->line2
== NON_UNIQUE
)
221 i
= binary_search(sequence
, longest
, entry
);
222 entry
->previous
= i
< 0 ? NULL
: sequence
[i
];
229 longest
= anchor_i
+ 1;
230 } else if (i
== longest
) {
235 /* No common unique lines were found */
241 /* Iterate starting at the last element, adjusting the "next" members */
242 entry
= sequence
[longest
- 1];
244 while (entry
->previous
) {
245 entry
->previous
->next
= entry
;
246 entry
= entry
->previous
;
252 static int match(struct hashmap
*map
, int line1
, int line2
)
254 xrecord_t
*record1
= map
->env
->xdf1
.recs
[line1
- 1];
255 xrecord_t
*record2
= map
->env
->xdf2
.recs
[line2
- 1];
256 return xdl_recmatch(record1
->ptr
, record1
->size
,
257 record2
->ptr
, record2
->size
, map
->xpp
->flags
);
260 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
261 xpparam_t
const *xpp
, xdfenv_t
*env
,
262 int line1
, int count1
, int line2
, int count2
);
264 static int walk_common_sequence(struct hashmap
*map
, struct entry
*first
,
265 int line1
, int count1
, int line2
, int count2
)
267 int end1
= line1
+ count1
, end2
= line2
+ count2
;
271 /* Try to grow the line ranges of common lines */
273 next1
= first
->line1
;
274 next2
= first
->line2
;
275 while (next1
> line1
&& next2
> line2
&&
276 match(map
, next1
- 1, next2
- 1)) {
284 while (line1
< next1
&& line2
< next2
&&
285 match(map
, line1
, line2
)) {
291 if (next1
> line1
|| next2
> line2
) {
292 struct hashmap submap
;
294 memset(&submap
, 0, sizeof(submap
));
295 if (patience_diff(map
->file1
, map
->file2
,
297 line1
, next1
- line1
,
298 line2
, next2
- line2
))
305 while (first
->next
&&
306 first
->next
->line1
== first
->line1
+ 1 &&
307 first
->next
->line2
== first
->line2
+ 1)
310 line1
= first
->line1
+ 1;
311 line2
= first
->line2
+ 1;
317 static int fall_back_to_classic_diff(struct hashmap
*map
,
318 int line1
, int count1
, int line2
, int count2
)
322 memset(&xpp
, 0, sizeof(xpp
));
323 xpp
.flags
= map
->xpp
->flags
& ~XDF_DIFF_ALGORITHM_MASK
;
325 return xdl_fall_back_diff(map
->env
, &xpp
,
326 line1
, count1
, line2
, count2
);
330 * Recursively find the longest common sequence of unique lines,
331 * and if none was found, ask xdl_do_diff() to do the job.
333 * This function assumes that env was prepared with xdl_prepare_env().
335 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
336 xpparam_t
const *xpp
, xdfenv_t
*env
,
337 int line1
, int count1
, int line2
, int count2
)
343 /* trivial case: one side is empty */
346 env
->xdf2
.rchg
[line2
++ - 1] = 1;
348 } else if (!count2
) {
350 env
->xdf1
.rchg
[line1
++ - 1] = 1;
354 memset(&map
, 0, sizeof(map
));
355 if (fill_hashmap(file1
, file2
, xpp
, env
, &map
,
356 line1
, count1
, line2
, count2
))
359 /* are there any matching lines at all? */
360 if (!map
.has_matches
) {
362 env
->xdf1
.rchg
[line1
++ - 1] = 1;
364 env
->xdf2
.rchg
[line2
++ - 1] = 1;
365 xdl_free(map
.entries
);
369 first
= find_longest_common_sequence(&map
);
371 result
= walk_common_sequence(&map
, first
,
372 line1
, count1
, line2
, count2
);
374 result
= fall_back_to_classic_diff(&map
,
375 line1
, count1
, line2
, count2
);
377 xdl_free(map
.entries
);
381 int xdl_do_patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
382 xpparam_t
const *xpp
, xdfenv_t
*env
)
384 if (xdl_prepare_env(file1
, file2
, xpp
, env
) < 0)
387 /* environment is cleaned up in xdl_diff() */
388 return patience_diff(file1
, file2
, xpp
, env
,
389 1, env
->xdf1
.nrec
, 1, env
->xdf2
.nrec
);