fstat: Fix module dependency conditions.
[gnulib/ericb.git] / lib / diffseq.h
blobd7a374357c7bd4533fc807966fb392b449b2d52c
/* Analyze differences between two vectors.

   Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2017 Free Software
   Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* The basic idea is to consider two vectors as similar if, when
   transforming the first vector into the second vector through a
   sequence of edits (inserts and deletes of one element each),
   this sequence is short - or equivalently, if the ordered list
   of elements that are untouched by these edits is long.  For a
   good introduction to the subject, read about the "Levenshtein
   distance" in Wikipedia.

   The basic algorithm is described in:
   "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
   Algorithmica Vol. 1, 1986, pp. 251-266,
   <http://dx.doi.org/10.1007/BF01840446>.
   See especially section 4.2, which describes the variation used below.

   The basic algorithm was independently discovered as described in:
   "Algorithms for Approximate String Matching", Esko Ukkonen,
   Information and Control Vol. 64, 1985, pp. 100-118,
   <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>.

   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
   at the price of producing suboptimal output for large inputs with
   many differences.  */

/* Before including this file, you need to define:
     ELEMENT                 The element type of the vectors being compared.
     EQUAL                   A two-argument macro that tests two elements for
                             equality.
     OFFSET                  A signed integer type sufficient to hold the
                             difference between two indices.  Usually
                             something like ptrdiff_t.
     EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
     NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
     NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
     EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
                             early abort of the computation.
     USE_HEURISTIC           (Optional) Define if you want to support the
                             heuristic for large vectors.

   It is also possible to use this file with abstract arrays.  In this case,
   xvec and yvec are not represented in memory.  They only exist conceptually.
   In this case, the list of defines above is amended as follows:
     ELEMENT                 Undefined.
     EQUAL                   Undefined.
     XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
                             A three-argument macro: References xvec[xoff] and
                             yvec[yoff] and tests these elements for equality.

   Before including this file, you also need to include:
     #include <limits.h>
     #include <stdbool.h>
     #include "minmax.h"
 */

/* Maximum value of type OFFSET.

   OFFSET must be a signed integer type and CHAR_BIT must be visible
   (from <limits.h>).  The value 2**(W-1) - 1, where W is the width of
   OFFSET in bits, is built as (2**(W-2) - 1) * 2 + 1 so that no
   intermediate result overflows; this assumes OFFSET has no padding
   bits.  An includer that knows better may define OFFSET_MAX itself
   before including this file.  */
#ifndef OFFSET_MAX
# define OFFSET_MAX \
   ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
#endif

/* Default to no early abort: unless the includer supplied its own
   EARLY_ABORT(ctxt), the test always yields false, so compareseq never
   stops before the whole edit script has been reported.  */
#ifndef EARLY_ABORT
# define EARLY_ABORT(ctxt) false
#endif

/* Use this to suppress gcc's "...may be used before initialized" warnings.
   IF_LINT (Code) expands to Code when GCC_LINT or lint is defined, and to
   nothing otherwise, so the extra initializer costs nothing in normal
   builds.
   Beware: The Code argument must not contain commas.  */
#ifndef IF_LINT
# if defined GCC_LINT || defined lint
#  define IF_LINT(Code) Code
# else
#  define IF_LINT(Code) /* empty */
# endif
#endif

/* As above, but when Code must contain one comma.  The text before the
   comma arrives as Code1 and the text after it as Code2; they are
   rejoined with a comma when GCC_LINT or lint is defined and dropped
   otherwise.  The argument must contain exactly one top-level comma.  */
#ifndef IF_LINT2
# if defined GCC_LINT || defined lint
#  define IF_LINT2(Code1, Code2) Code1, Code2
# else
#  define IF_LINT2(Code1, Code2) /* empty */
# endif
#endif
101 * Context of comparison operation.
103 struct context
105 #ifdef ELEMENT
106 /* Vectors being compared. */
107 ELEMENT const *xvec;
108 ELEMENT const *yvec;
109 #endif
111 /* Extra fields. */
112 EXTRA_CONTEXT_FIELDS
114 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
115 furthest along the given diagonal in the forward search of the edit
116 matrix. */
117 OFFSET *fdiag;
119 /* Vector, indexed by diagonal, containing the X coordinate of the point
120 furthest along the given diagonal in the backward search of the edit
121 matrix. */
122 OFFSET *bdiag;
124 #ifdef USE_HEURISTIC
125 /* This corresponds to the diff --speed-large-files flag. With this
126 heuristic, for vectors with a constant small density of changes,
127 the algorithm is linear in the vector size. */
128 bool heuristic;
129 #endif
131 /* Edit scripts longer than this are too expensive to compute. */
132 OFFSET too_expensive;
134 /* Snakes bigger than this are considered "big". */
135 #define SNAKE_LIMIT 20
138 struct partition
140 /* Midpoints of this partition. */
141 OFFSET xmid;
142 OFFSET ymid;
144 /* True if low half will be analyzed minimally. */
145 bool lo_minimal;
147 /* Likewise for high half. */
148 bool hi_minimal;
152 /* Find the midpoint of the shortest edit script for a specified portion
153 of the two vectors.
155 Scan from the beginnings of the vectors, and simultaneously from the ends,
156 doing a breadth-first search through the space of edit-sequence.
157 When the two searches meet, we have found the midpoint of the shortest
158 edit sequence.
160 If FIND_MINIMAL is true, find the minimal edit script regardless of
161 expense. Otherwise, if the search is too expensive, use heuristics to
162 stop the search and report a suboptimal answer.
164 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
165 XMID - YMID equals the number of inserted elements minus the number
166 of deleted elements (counting only elements before the midpoint).
168 Set PART->lo_minimal to true iff the minimal edit script for the
169 left half of the partition is known; similarly for PART->hi_minimal.
171 This function assumes that the first elements of the specified portions
172 of the two vectors do not match, and likewise that the last elements do not
173 match. The caller must trim matching elements from the beginning and end
174 of the portions it is going to specify.
176 If we return the "wrong" partitions, the worst this can do is cause
177 suboptimal diff output. It cannot cause incorrect diff output. */
179 static void
180 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
181 struct partition *part, struct context *ctxt)
183 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
184 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
185 #ifdef ELEMENT
186 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
187 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
188 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
189 #else
190 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
191 #endif
192 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
193 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
194 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
195 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
196 OFFSET fmin = fmid;
197 OFFSET fmax = fmid; /* Limits of top-down search. */
198 OFFSET bmin = bmid;
199 OFFSET bmax = bmid; /* Limits of bottom-up search. */
200 OFFSET c; /* Cost. */
201 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
202 diagonal with respect to the northwest. */
204 fd[fmid] = xoff;
205 bd[bmid] = xlim;
207 for (c = 1;; ++c)
209 OFFSET d; /* Active diagonal. */
210 bool big_snake = false;
212 /* Extend the top-down search by an edit step in each diagonal. */
213 if (fmin > dmin)
214 fd[--fmin - 1] = -1;
215 else
216 ++fmin;
217 if (fmax < dmax)
218 fd[++fmax + 1] = -1;
219 else
220 --fmax;
221 for (d = fmax; d >= fmin; d -= 2)
223 OFFSET x;
224 OFFSET y;
225 OFFSET tlo = fd[d - 1];
226 OFFSET thi = fd[d + 1];
227 OFFSET x0 = tlo < thi ? thi : tlo + 1;
229 for (x = x0, y = x0 - d;
230 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
231 x++, y++)
232 continue;
233 if (x - x0 > SNAKE_LIMIT)
234 big_snake = true;
235 fd[d] = x;
236 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
238 part->xmid = x;
239 part->ymid = y;
240 part->lo_minimal = part->hi_minimal = true;
241 return;
245 /* Similarly extend the bottom-up search. */
246 if (bmin > dmin)
247 bd[--bmin - 1] = OFFSET_MAX;
248 else
249 ++bmin;
250 if (bmax < dmax)
251 bd[++bmax + 1] = OFFSET_MAX;
252 else
253 --bmax;
254 for (d = bmax; d >= bmin; d -= 2)
256 OFFSET x;
257 OFFSET y;
258 OFFSET tlo = bd[d - 1];
259 OFFSET thi = bd[d + 1];
260 OFFSET x0 = tlo < thi ? tlo : thi - 1;
262 for (x = x0, y = x0 - d;
263 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
264 x--, y--)
265 continue;
266 if (x0 - x > SNAKE_LIMIT)
267 big_snake = true;
268 bd[d] = x;
269 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
271 part->xmid = x;
272 part->ymid = y;
273 part->lo_minimal = part->hi_minimal = true;
274 return;
278 if (find_minimal)
279 continue;
281 #ifdef USE_HEURISTIC
282 /* Heuristic: check occasionally for a diagonal that has made lots
283 of progress compared with the edit distance. If we have any
284 such, find the one that has made the most progress and return it
285 as if it had succeeded.
287 With this heuristic, for vectors with a constant small density
288 of changes, the algorithm is linear in the vector size. */
290 if (200 < c && big_snake && ctxt->heuristic)
293 OFFSET best = 0;
295 for (d = fmax; d >= fmin; d -= 2)
297 OFFSET dd = d - fmid;
298 OFFSET x = fd[d];
299 OFFSET y = x - d;
300 OFFSET v = (x - xoff) * 2 - dd;
302 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
304 if (v > best
305 && xoff + SNAKE_LIMIT <= x && x < xlim
306 && yoff + SNAKE_LIMIT <= y && y < ylim)
308 /* We have a good enough best diagonal; now insist
309 that it end with a significant snake. */
310 int k;
312 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
313 if (k == SNAKE_LIMIT)
315 best = v;
316 part->xmid = x;
317 part->ymid = y;
318 break;
323 if (best > 0)
325 part->lo_minimal = true;
326 part->hi_minimal = false;
327 return;
332 OFFSET best = 0;
334 for (d = bmax; d >= bmin; d -= 2)
336 OFFSET dd = d - bmid;
337 OFFSET x = bd[d];
338 OFFSET y = x - d;
339 OFFSET v = (xlim - x) * 2 + dd;
341 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
343 if (v > best
344 && xoff < x && x <= xlim - SNAKE_LIMIT
345 && yoff < y && y <= ylim - SNAKE_LIMIT)
347 /* We have a good enough best diagonal; now insist
348 that it end with a significant snake. */
349 int k;
351 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
352 if (k == SNAKE_LIMIT - 1)
354 best = v;
355 part->xmid = x;
356 part->ymid = y;
357 break;
362 if (best > 0)
364 part->lo_minimal = false;
365 part->hi_minimal = true;
366 return;
370 #endif /* USE_HEURISTIC */
372 /* Heuristic: if we've gone well beyond the call of duty, give up
373 and report halfway between our best results so far. */
374 if (c >= ctxt->too_expensive)
376 OFFSET fxybest;
377 OFFSET fxbest IF_LINT (= 0);
378 OFFSET bxybest;
379 OFFSET bxbest IF_LINT (= 0);
381 /* Find forward diagonal that maximizes X + Y. */
382 fxybest = -1;
383 for (d = fmax; d >= fmin; d -= 2)
385 OFFSET x = MIN (fd[d], xlim);
386 OFFSET y = x - d;
387 if (ylim < y)
389 x = ylim + d;
390 y = ylim;
392 if (fxybest < x + y)
394 fxybest = x + y;
395 fxbest = x;
399 /* Find backward diagonal that minimizes X + Y. */
400 bxybest = OFFSET_MAX;
401 for (d = bmax; d >= bmin; d -= 2)
403 OFFSET x = MAX (xoff, bd[d]);
404 OFFSET y = x - d;
405 if (y < yoff)
407 x = yoff + d;
408 y = yoff;
410 if (x + y < bxybest)
412 bxybest = x + y;
413 bxbest = x;
417 /* Use the better of the two diagonals. */
418 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
420 part->xmid = fxbest;
421 part->ymid = fxybest - fxbest;
422 part->lo_minimal = true;
423 part->hi_minimal = false;
425 else
427 part->xmid = bxbest;
428 part->ymid = bxybest - bxbest;
429 part->lo_minimal = false;
430 part->hi_minimal = true;
432 return;
435 #undef XREF_YREF_EQUAL
439 /* Compare in detail contiguous subsequences of the two vectors
440 which are known, as a whole, to match each other.
442 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
444 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
445 are origin-0.
447 If FIND_MINIMAL, find a minimal difference no matter how
448 expensive it is.
450 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
452 Return false if terminated normally, or true if terminated through early
453 abort. */
455 static bool
456 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
457 bool find_minimal, struct context *ctxt)
459 #ifdef ELEMENT
460 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
461 ELEMENT const *yv = ctxt->yvec;
462 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
463 #else
464 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
465 #endif
467 /* Slide down the bottom initial diagonal. */
468 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
470 xoff++;
471 yoff++;
474 /* Slide up the top initial diagonal. */
475 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
477 xlim--;
478 ylim--;
481 /* Handle simple cases. */
482 if (xoff == xlim)
483 while (yoff < ylim)
485 NOTE_INSERT (ctxt, yoff);
486 if (EARLY_ABORT (ctxt))
487 return true;
488 yoff++;
490 else if (yoff == ylim)
491 while (xoff < xlim)
493 NOTE_DELETE (ctxt, xoff);
494 if (EARLY_ABORT (ctxt))
495 return true;
496 xoff++;
498 else
500 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
502 /* Find a point of correspondence in the middle of the vectors. */
503 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
505 /* Use the partitions to split this problem into subproblems. */
506 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
507 return true;
508 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
509 return true;
512 return false;
513 #undef XREF_YREF_EQUAL

/* Drop the per-inclusion configuration macros and OFFSET_MAX, which is
   derived from OFFSET.  Presumably this lets the file be included again
   with different definitions.  IF_LINT, IF_LINT2 and SNAKE_LIMIT are
   left defined.  */
#undef ELEMENT
#undef EQUAL
#undef OFFSET
#undef EXTRA_CONTEXT_FIELDS
#undef NOTE_DELETE
#undef NOTE_INSERT
#undef EARLY_ABORT
#undef USE_HEURISTIC
#undef XVECREF_YVECREF_EQUAL
#undef OFFSET_MAX