2 * lcs.c : routines for creating an lcs
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
21 * ====================================================================
26 #include <apr_pools.h>
27 #include <apr_general.h>
33 * Calculate the Longest Common Subsequence (LCS) between two datasources.
34 * This function is what makes the diff code tick.
36 * The LCS algorithm implemented here is based on the approach described
37 * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison
38 * Algorithm", but has been modified for better performance.
40 * Let M and N be the lengths (number of tokens) of the two sources
41 * ('files'). The goal is to reach the end of both sources (files) with the
42 * minimum number of insertions + deletions. Since there is a known length
43 * difference N-M between the files, that is equivalent to just the minimum
44 * number of deletions, or equivalently the minimum number of insertions.
45 * For symmetry, we use the lesser number - deletions if M<N, insertions if
48 * Let 'k' be the difference in remaining length between the files, i.e.
49 * if we're at the beginning of both files, k=N-M, whereas k=0 for the
50 * 'end state', at the end of both files. An insertion will increase k by
51 * one, while a deletion decreases k by one. If k<0, then insertions are
52 * 'free' - we need those to reach the end state k=0 anyway - but deletions
53 * are costly: Adding a deletion means that we will have to add an additional
54 * insertion later to reach the end state, so it doesn't matter if we count
55 * deletions or insertions. Similarly, deletions are free for k>0.
57 * Let a 'state' be a given position in each file {pos1, pos2}. An array
58 * 'fp' keeps track of the best possible state (largest values of
59 * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
60 * from k=0), as well as a linked list of what matches were used to reach
61 * that state. For each new value of p, we find for each value of k the
62 * best achievable state for that k - either by doing a costly operation
63 * (deletion if k<0) from a state achieved at a lower p, or doing a free
64 * operation (insertion if k<0) from a state achieved at the same p -
65 * and in both cases advancing past any matching regions found. This is
66 * handled by running loops over k in order of descending absolute value.
68 * A recent improvement of the algorithm is to ignore tokens that are unique
69 * to one file or the other, as those are known from the start to be
70 * impossible to match.
73 typedef struct svn_diff__snake_t svn_diff__snake_t
;
75 struct svn_diff__snake_t
79 svn_diff__position_t
*position
[2];
82 static APR_INLINE
void
83 svn_diff__snake(svn_diff__snake_t
*fp_k
,
84 svn_diff__token_index_t
*token_counts
[2],
85 svn_diff__lcs_t
**freelist
,
88 svn_diff__position_t
*start_position
[2];
89 svn_diff__position_t
*position
[2];
91 svn_diff__lcs_t
*previous_lcs
;
93 /* The previous entry at fp[k] is going to be replaced. See if we
94 * can mark that lcs node for reuse, because the sequence up to this
95 * point was a dead end.
104 previous_lcs
= lcs
->next
;
105 lcs
->next
= *freelist
;
110 if (fp_k
[-1].y
>= fp_k
[1].y
)
112 start_position
[0] = fp_k
[-1].position
[0];
113 start_position
[1] = fp_k
[-1].position
[1]->next
;
115 previous_lcs
= fp_k
[-1].lcs
;
119 start_position
[0] = fp_k
[1].position
[0]->next
;
120 start_position
[1] = fp_k
[1].position
[1];
122 previous_lcs
= fp_k
[1].lcs
;
128 previous_lcs
->refcount
++;
131 /* ### Optimization, skip all positions that don't have matchpoints
132 * ### anyway. Beware of the sentinel, don't skip it!
135 position
[0] = start_position
[0];
136 position
[1] = start_position
[1];
140 while (position
[0]->token_index
== position
[1]->token_index
)
142 position
[0] = position
[0]->next
;
143 position
[1] = position
[1]->next
;
146 if (position
[1] != start_position
[1])
151 *freelist
= lcs
->next
;
155 lcs
= apr_palloc(pool
, sizeof(*lcs
));
158 lcs
->position
[0] = start_position
[0];
159 lcs
->position
[1] = start_position
[1];
160 lcs
->length
= position
[1]->offset
- start_position
[1]->offset
;
161 lcs
->next
= previous_lcs
;
164 start_position
[0] = position
[0];
165 start_position
[1] = position
[1];
168 /* Skip any and all tokens that only occur in one of the files */
169 if (position
[0]->token_index
>= 0
170 && token_counts
[1][position
[0]->token_index
] == 0)
171 start_position
[0] = position
[0] = position
[0]->next
;
172 else if (position
[1]->token_index
>= 0
173 && token_counts
[0][position
[1]->token_index
] == 0)
174 start_position
[1] = position
[1] = position
[1]->next
;
179 fp_k
[0].lcs
= previous_lcs
;
180 fp_k
[0].position
[0] = position
[0];
181 fp_k
[0].position
[1] = position
[1];
183 fp_k
[0].y
= position
[1]->offset
;
187 static svn_diff__lcs_t
*
188 svn_diff__lcs_reverse(svn_diff__lcs_t
*lcs
)
190 svn_diff__lcs_t
*next
;
191 svn_diff__lcs_t
*prev
;
206 /* Prepends a new lcs chunk for the amount of LINES at the given positions
207 * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
208 * This function assumes LINES > 0. */
209 static svn_diff__lcs_t
*
210 prepend_lcs(svn_diff__lcs_t
*lcs
, apr_off_t lines
,
211 apr_off_t pos0_offset
, apr_off_t pos1_offset
,
214 svn_diff__lcs_t
*new_lcs
;
216 SVN_ERR_ASSERT_NO_RETURN(lines
> 0);
218 new_lcs
= apr_palloc(pool
, sizeof(*new_lcs
));
219 new_lcs
->position
[0] = apr_pcalloc(pool
, sizeof(*new_lcs
->position
[0]));
220 new_lcs
->position
[0]->offset
= pos0_offset
;
221 new_lcs
->position
[1] = apr_pcalloc(pool
, sizeof(*new_lcs
->position
[1]));
222 new_lcs
->position
[1]->offset
= pos1_offset
;
223 new_lcs
->length
= lines
;
224 new_lcs
->refcount
= 1;
232 svn_diff__lcs(svn_diff__position_t
*position_list1
, /* pointer to tail (ring) */
233 svn_diff__position_t
*position_list2
, /* pointer to tail (ring) */
234 svn_diff__token_index_t
*token_counts_list1
, /* array of counts */
235 svn_diff__token_index_t
*token_counts_list2
, /* array of counts */
236 svn_diff__token_index_t num_tokens
,
237 apr_off_t prefix_lines
,
238 apr_off_t suffix_lines
,
242 svn_diff__token_index_t
*token_counts
[2];
243 svn_diff__token_index_t unique_count
[2];
244 svn_diff__token_index_t token_index
;
245 svn_diff__snake_t
*fp
;
249 svn_diff__lcs_t
*lcs
, *lcs_freelist
= NULL
;
251 svn_diff__position_t sentinel_position
[2];
253 /* Since EOF is always a sync point we tack on an EOF link
254 * with sentinel positions
256 lcs
= apr_palloc(pool
, sizeof(*lcs
));
257 lcs
->position
[0] = apr_pcalloc(pool
, sizeof(*lcs
->position
[0]));
258 lcs
->position
[0]->offset
= position_list1
259 ? position_list1
->offset
+ suffix_lines
+ 1
260 : prefix_lines
+ suffix_lines
+ 1;
261 lcs
->position
[1] = apr_pcalloc(pool
, sizeof(*lcs
->position
[1]));
262 lcs
->position
[1]->offset
= position_list2
263 ? position_list2
->offset
+ suffix_lines
+ 1
264 : prefix_lines
+ suffix_lines
+ 1;
269 if (position_list1
== NULL
|| position_list2
== NULL
)
272 lcs
= prepend_lcs(lcs
, suffix_lines
,
273 lcs
->position
[0]->offset
- suffix_lines
,
274 lcs
->position
[1]->offset
- suffix_lines
,
277 lcs
= prepend_lcs(lcs
, prefix_lines
, 1, 1, pool
);
282 unique_count
[1] = unique_count
[0] = 0;
283 for (token_index
= 0; token_index
< num_tokens
; token_index
++)
285 if (token_counts_list1
[token_index
] == 0)
286 unique_count
[1] += token_counts_list2
[token_index
];
287 if (token_counts_list2
[token_index
] == 0)
288 unique_count
[0] += token_counts_list1
[token_index
];
291 /* Calculate lengths M and N of the sequences to be compared. Do not
292 * count tokens unique to one file, as those are ignored in __snake.
294 length
[0] = position_list1
->offset
- position_list1
->next
->offset
+ 1
296 length
[1] = position_list2
->offset
- position_list2
->next
->offset
+ 1
299 /* strikerXXX: here we allocate the furthest point array, which is
300 * strikerXXX: sized M + N + 3 (!)
302 fp
= apr_pcalloc(pool
,
303 sizeof(*fp
) * (apr_size_t
)(length
[0] + length
[1] + 3));
305 /* The origo of fp corresponds to the end state, where we are
306 * at the end of both files. The valid states thus span from
307 * -N (at end of first file and at the beginning of the second
308 * file) to +M (the opposite :). Finally, svn_diff__snake needs
309 * 1 extra slot on each side to work.
313 sentinel_position
[0].next
= position_list1
->next
;
314 position_list1
->next
= &sentinel_position
[0];
315 sentinel_position
[0].offset
= position_list1
->offset
+ 1;
316 token_counts
[0] = token_counts_list1
;
318 sentinel_position
[1].next
= position_list2
->next
;
319 position_list2
->next
= &sentinel_position
[1];
320 sentinel_position
[1].offset
= position_list2
->offset
+ 1;
321 token_counts
[1] = token_counts_list2
;
323 /* Negative indices will not be used elsewhere
325 sentinel_position
[0].token_index
= -1;
326 sentinel_position
[1].token_index
= -2;
328 /* position d = M - N corresponds to the initial state, where
329 * we are at the beginning of both files.
331 d
= length
[0] - length
[1];
333 /* k = d - 1 will be the first to be used to get previous
334 * position information from, make sure it holds sane
337 fp
[d
- 1].position
[0] = sentinel_position
[0].next
;
338 fp
[d
- 1].position
[1] = &sentinel_position
[1];
343 /* For k < 0, insertions are free */
344 for (k
= (d
< 0 ? d
: 0) - p
; k
< 0; k
++)
346 svn_diff__snake(fp
+ k
, token_counts
, &lcs_freelist
, pool
);
348 /* for k > 0, deletions are free */
349 for (k
= (d
> 0 ? d
: 0) + p
; k
>= 0; k
--)
351 svn_diff__snake(fp
+ k
, token_counts
, &lcs_freelist
, pool
);
356 while (fp
[0].position
[1] != &sentinel_position
[1]);
359 lcs
->next
= prepend_lcs(fp
[0].lcs
, suffix_lines
,
360 lcs
->position
[0]->offset
- suffix_lines
,
361 lcs
->position
[1]->offset
- suffix_lines
,
364 lcs
->next
= fp
[0].lcs
;
366 lcs
= svn_diff__lcs_reverse(lcs
);
368 position_list1
->next
= sentinel_position
[0].next
;
369 position_list2
->next
= sentinel_position
[1].next
;
372 return prepend_lcs(lcs
, prefix_lines
, 1, 1, pool
);