2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003-2016 Davide Libenzi, Johannes E. Schindelin
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
19 * Davide Libenzi <davidel@xmailserver.org>
27 * The basic idea of patience diff is to find lines that are unique in
28 * both files. These are intuitively the ones that we want to see as
31 * The maximal ordered sequence of such line pairs (where ordered means
32 * that the order in the sequence agrees with the order of the lines in
33 * both files) naturally defines an initial set of common lines.
35 * Now, the algorithm tries to extend the set of common lines by growing
36 * the line ranges where the files have identical lines.
38 * Between those common lines, the patience diff algorithm is applied
39 * recursively, until no unique line pairs can be found; these line ranges
40 * are handled by the well-known Myers algorithm.
43 #define NON_UNIQUE ULONG_MAX
46 * This is a hash mapping from line hash to line numbers in the first and
54 * 0 = unused entry, 1 = first line, 2 = second, etc.
55 * line2 is NON_UNIQUE if the line is not unique
56 * in either the first or the second file.
58 unsigned long line1
, line2
;
60 * "next" & "previous" are used for the longest common
62 * initially, "next" reflects only the order in file1.
64 struct entry
*next
, *previous
;
67 * If 1, this entry can serve as an anchor. See
68 * Documentation/diff-options.txt for more information.
71 } *entries
, *first
, *last
;
72 /* were common records found? */
73 unsigned long has_matches
;
74 mmfile_t
*file1
, *file2
;
79 static int is_anchor(xpparam_t
const *xpp
, const char *line
)
82 for (i
= 0; i
< xpp
->anchors_nr
; i
++) {
83 if (!strncmp(line
, xpp
->anchors
[i
], strlen(xpp
->anchors
[i
])))
89 /* The argument "pass" is 1 for the first file, 2 for the second. */
90 static void insert_record(xpparam_t
const *xpp
, int line
, struct hashmap
*map
,
93 xrecord_t
**records
= pass
== 1 ?
94 map
->env
->xdf1
.recs
: map
->env
->xdf2
.recs
;
95 xrecord_t
*record
= records
[line
- 1], *other
;
97 * After xdl_prepare_env() (or more precisely, due to
98 * xdl_classify_record()), the "ha" member of the records (AKA lines)
99 * is _not_ the hash anymore, but a linearized version of it. In
100 * other words, the "ha" member is guaranteed to start with 0 and
101 * the second record's ha can only be 0 or 1, etc.
103 * So we multiply ha by 2 in the hope that the hashing was
106 int index
= (int)((record
->ha
<< 1) % map
->alloc
);
108 while (map
->entries
[index
].line1
) {
109 other
= map
->env
->xdf1
.recs
[map
->entries
[index
].line1
- 1];
110 if (map
->entries
[index
].hash
!= record
->ha
||
111 !xdl_recmatch(record
->ptr
, record
->size
,
112 other
->ptr
, other
->size
,
114 if (++index
>= map
->alloc
)
119 map
->has_matches
= 1;
120 if (pass
== 1 || map
->entries
[index
].line2
)
121 map
->entries
[index
].line2
= NON_UNIQUE
;
123 map
->entries
[index
].line2
= line
;
128 map
->entries
[index
].line1
= line
;
129 map
->entries
[index
].hash
= record
->ha
;
130 map
->entries
[index
].anchor
= is_anchor(xpp
, map
->env
->xdf1
.recs
[line
- 1]->ptr
);
132 map
->first
= map
->entries
+ index
;
134 map
->last
->next
= map
->entries
+ index
;
135 map
->entries
[index
].previous
= map
->last
;
137 map
->last
= map
->entries
+ index
;
142 * This function has to be called for each recursion into the inter-hunk
143 * parts, as previously non-unique lines can become unique when being
144 * restricted to a smaller part of the files.
146 * It is assumed that env has been prepared using xdl_prepare().
148 static int fill_hashmap(mmfile_t
*file1
, mmfile_t
*file2
,
149 xpparam_t
const *xpp
, xdfenv_t
*env
,
150 struct hashmap
*result
,
151 int line1
, int count1
, int line2
, int count2
)
153 result
->file1
= file1
;
154 result
->file2
= file2
;
158 /* We know exactly how large we want the hash map */
159 result
->alloc
= count1
* 2;
160 result
->entries
= (struct entry
*)
161 xdl_malloc(result
->alloc
* sizeof(struct entry
));
162 if (!result
->entries
)
164 memset(result
->entries
, 0, result
->alloc
* sizeof(struct entry
));
166 /* First, fill with entries from the first file */
168 insert_record(xpp
, line1
++, result
, 1);
170 /* Then search for matches in the second file */
172 insert_record(xpp
, line2
++, result
, 2);
178 * Find the longest sequence with a smaller last element (meaning a smaller
179 * line2, as we construct the sequence with entries ordered by line1).
181 static int binary_search(struct entry
**sequence
, int longest
,
184 int left
= -1, right
= longest
;
186 while (left
+ 1 < right
) {
187 int middle
= left
+ (right
- left
) / 2;
188 /* by construction, no two entries can be equal */
189 if (sequence
[middle
]->line2
> entry
->line2
)
194 /* return the index in "sequence", _not_ the sequence length */
199 * The idea is to start with the list of common unique lines sorted by
200 * the order in file1. For each of these pairs, the longest (partial)
201 * sequence whose last element's line2 is smaller is determined.
203 * For efficiency, the sequences are kept in a list containing exactly one
204 * item per sequence length: the sequence with the smallest last
205 * element (in terms of line2).
207 static struct entry
*find_longest_common_sequence(struct hashmap
*map
)
209 struct entry
**sequence
= xdl_malloc(map
->nr
* sizeof(struct entry
*));
214 * If not -1, this entry in sequence must never be overridden.
215 * Therefore, overriding entries before this has no effect, so
216 * do not do that either.
220 for (entry
= map
->first
; entry
; entry
= entry
->next
) {
221 if (!entry
->line2
|| entry
->line2
== NON_UNIQUE
)
223 i
= binary_search(sequence
, longest
, entry
);
224 entry
->previous
= i
< 0 ? NULL
: sequence
[i
];
231 longest
= anchor_i
+ 1;
232 } else if (i
== longest
) {
237 /* No common unique lines were found */
243 /* Iterate starting at the last element, adjusting the "next" members */
244 entry
= sequence
[longest
- 1];
246 while (entry
->previous
) {
247 entry
->previous
->next
= entry
;
248 entry
= entry
->previous
;
254 static int match(struct hashmap
*map
, int line1
, int line2
)
256 xrecord_t
*record1
= map
->env
->xdf1
.recs
[line1
- 1];
257 xrecord_t
*record2
= map
->env
->xdf2
.recs
[line2
- 1];
258 return xdl_recmatch(record1
->ptr
, record1
->size
,
259 record2
->ptr
, record2
->size
, map
->xpp
->flags
);
262 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
263 xpparam_t
const *xpp
, xdfenv_t
*env
,
264 int line1
, int count1
, int line2
, int count2
);
266 static int walk_common_sequence(struct hashmap
*map
, struct entry
*first
,
267 int line1
, int count1
, int line2
, int count2
)
269 int end1
= line1
+ count1
, end2
= line2
+ count2
;
273 /* Try to grow the line ranges of common lines */
275 next1
= first
->line1
;
276 next2
= first
->line2
;
277 while (next1
> line1
&& next2
> line2
&&
278 match(map
, next1
- 1, next2
- 1)) {
286 while (line1
< next1
&& line2
< next2
&&
287 match(map
, line1
, line2
)) {
293 if (next1
> line1
|| next2
> line2
) {
294 struct hashmap submap
;
296 memset(&submap
, 0, sizeof(submap
));
297 if (patience_diff(map
->file1
, map
->file2
,
299 line1
, next1
- line1
,
300 line2
, next2
- line2
))
307 while (first
->next
&&
308 first
->next
->line1
== first
->line1
+ 1 &&
309 first
->next
->line2
== first
->line2
+ 1)
312 line1
= first
->line1
+ 1;
313 line2
= first
->line2
+ 1;
319 static int fall_back_to_classic_diff(struct hashmap
*map
,
320 int line1
, int count1
, int line2
, int count2
)
323 xpp
.flags
= map
->xpp
->flags
& ~XDF_DIFF_ALGORITHM_MASK
;
325 return xdl_fall_back_diff(map
->env
, &xpp
,
326 line1
, count1
, line2
, count2
);
330 * Recursively find the longest common sequence of unique lines,
331 * and if none was found, ask xdl_do_diff() to do the job.
333 * This function assumes that env was prepared with xdl_prepare_env().
335 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
336 xpparam_t
const *xpp
, xdfenv_t
*env
,
337 int line1
, int count1
, int line2
, int count2
)
343 /* trivial case: one side is empty */
346 env
->xdf2
.rchg
[line2
++ - 1] = 1;
348 } else if (!count2
) {
350 env
->xdf1
.rchg
[line1
++ - 1] = 1;
354 memset(&map
, 0, sizeof(map
));
355 if (fill_hashmap(file1
, file2
, xpp
, env
, &map
,
356 line1
, count1
, line2
, count2
))
359 /* are there any matching lines at all? */
360 if (!map
.has_matches
) {
362 env
->xdf1
.rchg
[line1
++ - 1] = 1;
364 env
->xdf2
.rchg
[line2
++ - 1] = 1;
365 xdl_free(map
.entries
);
369 first
= find_longest_common_sequence(&map
);
371 result
= walk_common_sequence(&map
, first
,
372 line1
, count1
, line2
, count2
);
374 result
= fall_back_to_classic_diff(&map
,
375 line1
, count1
, line2
, count2
);
377 xdl_free(map
.entries
);
381 int xdl_do_patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
382 xpparam_t
const *xpp
, xdfenv_t
*env
)
384 if (xdl_prepare_env(file1
, file2
, xpp
, env
) < 0)
387 /* environment is cleaned up in xdl_diff() */
388 return patience_diff(file1
, file2
, xpp
, env
,
389 1, env
->xdf1
.nrec
, 1, env
->xdf2
.nrec
);