2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003-2009 Davide Libenzi, Johannes E. Schindelin
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * Davide Libenzi <davidel@xmailserver.org>
27 * The basic idea of patience diff is to find lines that are unique in
28 * both files. These are intuitively the ones that we want to see as
31 * The maximal ordered sequence of such line pairs (where ordered means
32 * that the order in the sequence agrees with the order of the lines in
33 * both files) naturally defines an initial set of common lines.
35 * Now, the algorithm tries to extend the set of common lines by growing
36 * the line ranges where the files have identical lines.
38 * Between those common lines, the patience diff algorithm is applied
39 * recursively, until no unique line pairs can be found; these line ranges
40 * are handled by the well-known Myers algorithm.
43 #define NON_UNIQUE ULONG_MAX
46 * This is a hash mapping from line hash to line numbers in the first and
54 * 0 = unused entry, 1 = first line, 2 = second, etc.
55 * line2 is NON_UNIQUE if the line is not unique
56 * in either the first or the second file.
58 unsigned long line1
, line2
;
60 * "next" & "previous" are used for the longest common
62 * initially, "next" reflects only the order in file1.
64 struct entry
*next
, *previous
;
65 } *entries
, *first
, *last
;
66 /* were common records found? */
67 unsigned long has_matches
;
68 mmfile_t
*file1
, *file2
;
73 /* The argument "pass" is 1 for the first file, 2 for the second. */
74 static void insert_record(int line
, struct hashmap
*map
, int pass
)
76 xrecord_t
**records
= pass
== 1 ?
77 map
->env
->xdf1
.recs
: map
->env
->xdf2
.recs
;
78 xrecord_t
*record
= records
[line
- 1], *other
;
80 * After xdl_prepare_env() (or more precisely, due to
81 * xdl_classify_record()), the "ha" member of the records (AKA lines)
82 * is _not_ the hash anymore, but a linearized version of it. In
83 * other words, the "ha" member is guaranteed to start with 0 and
84 * the second record's ha can only be 0 or 1, etc.
86 * So we multiply ha by 2 in the hope that the hashing was
89 int index
= (int)((record
->ha
<< 1) % map
->alloc
);
91 while (map
->entries
[index
].line1
) {
92 other
= map
->env
->xdf1
.recs
[map
->entries
[index
].line1
- 1];
93 if (map
->entries
[index
].hash
!= record
->ha
||
94 !xdl_recmatch(record
->ptr
, record
->size
,
95 other
->ptr
, other
->size
,
97 if (++index
>= map
->alloc
)
102 map
->has_matches
= 1;
103 if (pass
== 1 || map
->entries
[index
].line2
)
104 map
->entries
[index
].line2
= NON_UNIQUE
;
106 map
->entries
[index
].line2
= line
;
111 map
->entries
[index
].line1
= line
;
112 map
->entries
[index
].hash
= record
->ha
;
114 map
->first
= map
->entries
+ index
;
116 map
->last
->next
= map
->entries
+ index
;
117 map
->entries
[index
].previous
= map
->last
;
119 map
->last
= map
->entries
+ index
;
124 * This function has to be called for each recursion into the inter-hunk
125 * parts, as previously non-unique lines can become unique when being
126 * restricted to a smaller part of the files.
128 * It is assumed that env has been prepared using xdl_prepare().
130 static int fill_hashmap(mmfile_t
*file1
, mmfile_t
*file2
,
131 xpparam_t
const *xpp
, xdfenv_t
*env
,
132 struct hashmap
*result
,
133 int line1
, int count1
, int line2
, int count2
)
135 result
->file1
= file1
;
136 result
->file2
= file2
;
140 /* We know exactly how large we want the hash map */
141 result
->alloc
= count1
* 2;
142 result
->entries
= (struct entry
*)
143 xdl_malloc(result
->alloc
* sizeof(struct entry
));
144 if (!result
->entries
)
146 memset(result
->entries
, 0, result
->alloc
* sizeof(struct entry
));
148 /* First, fill with entries from the first file */
150 insert_record(line1
++, result
, 1);
152 /* Then search for matches in the second file */
154 insert_record(line2
++, result
, 2);
160 * Find the longest sequence with a smaller last element (meaning a smaller
161 * line2, as we construct the sequence with entries ordered by line1).
163 static int binary_search(struct entry
**sequence
, int longest
,
166 int left
= -1, right
= longest
;
168 while (left
+ 1 < right
) {
169 int middle
= (left
+ right
) / 2;
170 /* by construction, no two entries can be equal */
171 if (sequence
[middle
]->line2
> entry
->line2
)
176 /* return the index in "sequence", _not_ the sequence length */
181 * The idea is to start with the list of common unique lines sorted by
182 * the order in file1. For each of these pairs, the longest (partial)
183 * sequence whose last element's line2 is smaller is determined.
185 * For efficiency, the sequences are kept in a list containing exactly one
186 * item per sequence length: the sequence with the smallest last
187 * element (in terms of line2).
189 static struct entry
*find_longest_common_sequence(struct hashmap
*map
)
191 struct entry
**sequence
= xdl_malloc(map
->nr
* sizeof(struct entry
*));
195 for (entry
= map
->first
; entry
; entry
= entry
->next
) {
196 if (!entry
->line2
|| entry
->line2
== NON_UNIQUE
)
198 i
= binary_search(sequence
, longest
, entry
);
199 entry
->previous
= i
< 0 ? NULL
: sequence
[i
];
200 sequence
[++i
] = entry
;
205 /* No common unique lines were found */
211 /* Iterate starting at the last element, adjusting the "next" members */
212 entry
= sequence
[longest
- 1];
214 while (entry
->previous
) {
215 entry
->previous
->next
= entry
;
216 entry
= entry
->previous
;
222 static int match(struct hashmap
*map
, int line1
, int line2
)
224 xrecord_t
*record1
= map
->env
->xdf1
.recs
[line1
- 1];
225 xrecord_t
*record2
= map
->env
->xdf2
.recs
[line2
- 1];
226 return xdl_recmatch(record1
->ptr
, record1
->size
,
227 record2
->ptr
, record2
->size
, map
->xpp
->flags
);
230 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
231 xpparam_t
const *xpp
, xdfenv_t
*env
,
232 int line1
, int count1
, int line2
, int count2
);
234 static int walk_common_sequence(struct hashmap
*map
, struct entry
*first
,
235 int line1
, int count1
, int line2
, int count2
)
237 int end1
= line1
+ count1
, end2
= line2
+ count2
;
241 /* Try to grow the line ranges of common lines */
243 next1
= first
->line1
;
244 next2
= first
->line2
;
245 while (next1
> line1
&& next2
> line2
&&
246 match(map
, next1
- 1, next2
- 1)) {
254 while (line1
< next1
&& line2
< next2
&&
255 match(map
, line1
, line2
)) {
261 if (next1
> line1
|| next2
> line2
) {
262 struct hashmap submap
;
264 memset(&submap
, 0, sizeof(submap
));
265 if (patience_diff(map
->file1
, map
->file2
,
267 line1
, next1
- line1
,
268 line2
, next2
- line2
))
275 while (first
->next
&&
276 first
->next
->line1
== first
->line1
+ 1 &&
277 first
->next
->line2
== first
->line2
+ 1)
280 line1
= first
->line1
+ 1;
281 line2
= first
->line2
+ 1;
287 static int fall_back_to_classic_diff(struct hashmap
*map
,
288 int line1
, int count1
, int line2
, int count2
)
291 * This probably does not work outside Git, since
292 * we have a very simple mmfile structure.
294 * Note: ideally, we would reuse the prepared environment, but
295 * the libxdiff interface does not (yet) allow for diffing only
296 * ranges of lines instead of the whole files.
298 mmfile_t subfile1
, subfile2
;
302 subfile1
.ptr
= (char *)map
->env
->xdf1
.recs
[line1
- 1]->ptr
;
303 subfile1
.size
= map
->env
->xdf1
.recs
[line1
+ count1
- 2]->ptr
+
304 map
->env
->xdf1
.recs
[line1
+ count1
- 2]->size
- subfile1
.ptr
;
305 subfile2
.ptr
= (char *)map
->env
->xdf2
.recs
[line2
- 1]->ptr
;
306 subfile2
.size
= map
->env
->xdf2
.recs
[line2
+ count2
- 2]->ptr
+
307 map
->env
->xdf2
.recs
[line2
+ count2
- 2]->size
- subfile2
.ptr
;
308 xpp
.flags
= map
->xpp
->flags
& ~XDF_PATIENCE_DIFF
;
309 if (xdl_do_diff(&subfile1
, &subfile2
, &xpp
, &env
) < 0)
312 memcpy(map
->env
->xdf1
.rchg
+ line1
- 1, env
.xdf1
.rchg
, count1
);
313 memcpy(map
->env
->xdf2
.rchg
+ line2
- 1, env
.xdf2
.rchg
, count2
);
321 * Recursively find the longest common sequence of unique lines,
322 * and if none was found, ask xdl_do_diff() to do the job.
324 * This function assumes that env was prepared with xdl_prepare_env().
326 static int patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
327 xpparam_t
const *xpp
, xdfenv_t
*env
,
328 int line1
, int count1
, int line2
, int count2
)
334 /* trivial case: one side is empty */
337 env
->xdf2
.rchg
[line2
++ - 1] = 1;
339 } else if (!count2
) {
341 env
->xdf1
.rchg
[line1
++ - 1] = 1;
345 memset(&map
, 0, sizeof(map
));
346 if (fill_hashmap(file1
, file2
, xpp
, env
, &map
,
347 line1
, count1
, line2
, count2
))
350 /* are there any matching lines at all? */
351 if (!map
.has_matches
) {
353 env
->xdf1
.rchg
[line1
++ - 1] = 1;
355 env
->xdf2
.rchg
[line2
++ - 1] = 1;
356 xdl_free(map
.entries
);
360 first
= find_longest_common_sequence(&map
);
362 result
= walk_common_sequence(&map
, first
,
363 line1
, count1
, line2
, count2
);
365 result
= fall_back_to_classic_diff(&map
,
366 line1
, count1
, line2
, count2
);
368 xdl_free(map
.entries
);
372 int xdl_do_patience_diff(mmfile_t
*file1
, mmfile_t
*file2
,
373 xpparam_t
const *xpp
, xdfenv_t
*env
)
375 if (xdl_prepare_env(file1
, file2
, xpp
, env
) < 0)
378 /* environment is cleaned up in xdl_diff() */
379 return patience_diff(file1
, file2
, xpp
, env
,
380 1, env
->xdf1
.nrec
, 1, env
->xdf2
.nrec
);