exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / diffseq.h
blob0c5bc9cbc6da69e56f5bbbca3d7b472d57c76af8
1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2024 Free Software
4 Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30 Algorithmica Vol. 1, 1986, pp. 251-266,
31 <https://doi.org/10.1007/BF01840446>.
32 See especially section 4.2, which describes the variation used below.
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", Esko Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118,
37 <https://doi.org/10.1016/S0019-9958(85)80046-2>.
39 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41 at the price of producing suboptimal output for large inputs with
42 many differences. */
44 /* Before including this file, you need to define:
45 ELEMENT The element type of the vectors being compared.
46 EQUAL A two-argument macro that tests two elements for
47 equality.
48 OFFSET A signed integer type sufficient to hold the
49 difference between two indices. Usually
50 something like ptrdiff_t.
51 OFFSET_MAX (Optional) The maximum value of OFFSET (e.g.,
52 PTRDIFF_MAX). If omitted, it is inferred in a
53 way portable to the vast majority of C platforms,
54 as they lack padding bits.
55 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
56 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
57 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
58 NOTE_ORDERED (Optional) A boolean expression saying that
59 NOTE_DELETE and NOTE_INSERT calls must be
60 issued in offset order.
61 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
62 early abort of the computation.
63 USE_HEURISTIC (Optional) Define if you want to support the
64 heuristic for large vectors.
66 It is also possible to use this file with abstract arrays. In this case,
67 xvec and yvec are not represented in memory. They only exist conceptually.
68 In this case, the list of defines above is amended as follows:
69 ELEMENT Undefined.
70 EQUAL Undefined.
71 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
72 A three-argument macro: References xvec[xoff] and
73 yvec[yoff] and tests these elements for equality.
75 Before including this file, you also need to include:
76 #include <limits.h>
77 #include "minmax.h"
80 /* Maximum value of type OFFSET. */
81 #ifndef OFFSET_MAX
82 # define OFFSET_MAX \
83 ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
84 #endif
86 /* Default to no early abort. */
87 #ifndef EARLY_ABORT
88 # define EARLY_ABORT(ctxt) false
89 #endif
91 #ifndef NOTE_ORDERED
92 # define NOTE_ORDERED false
93 #endif
95 /* Suppress gcc's "...may be used before initialized" warnings,
96 generated by GCC versions up to at least GCC 13.2. */
97 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
98 # pragma GCC diagnostic push
99 # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
100 #endif
103 * Context of comparison operation.
105 struct context
107 #ifdef ELEMENT
108 /* Vectors being compared. */
109 ELEMENT const *xvec;
110 ELEMENT const *yvec;
111 #endif
113 /* Extra fields. */
114 EXTRA_CONTEXT_FIELDS
116 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
117 furthest along the given diagonal in the forward search of the edit
118 matrix. */
119 OFFSET *fdiag;
121 /* Vector, indexed by diagonal, containing the X coordinate of the point
122 furthest along the given diagonal in the backward search of the edit
123 matrix. */
124 OFFSET *bdiag;
126 #ifdef USE_HEURISTIC
127 /* This corresponds to the diff --speed-large-files flag. With this
128 heuristic, for vectors with a constant small density of changes,
129 the algorithm is linear in the vector size. */
130 bool heuristic;
131 #endif
133 /* Edit scripts longer than this are too expensive to compute. */
134 OFFSET too_expensive;
136 /* Snakes bigger than this are considered "big". */
137 #define SNAKE_LIMIT 20
140 struct partition
142 /* Midpoints of this partition. */
143 OFFSET xmid;
144 OFFSET ymid;
146 /* True if low half will be analyzed minimally. */
147 bool lo_minimal;
149 /* Likewise for high half. */
150 bool hi_minimal;
154 /* Find the midpoint of the shortest edit script for a specified portion
155 of the two vectors.
157 Scan from the beginnings of the vectors, and simultaneously from the ends,
158 doing a breadth-first search through the space of edit-sequence.
159 When the two searches meet, we have found the midpoint of the shortest
160 edit sequence.
162 If FIND_MINIMAL is true, find the minimal edit script regardless of
163 expense. Otherwise, if the search is too expensive, use heuristics to
164 stop the search and report a suboptimal answer.
166 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
167 XMID - YMID equals the number of inserted elements minus the number
168 of deleted elements (counting only elements before the midpoint).
170 Set PART->lo_minimal to true iff the minimal edit script for the
171 left half of the partition is known; similarly for PART->hi_minimal.
173 This function assumes that the first elements of the specified portions
174 of the two vectors do not match, and likewise that the last elements do not
175 match. The caller must trim matching elements from the beginning and end
176 of the portions it is going to specify.
178 If we return the "wrong" partitions, the worst this can do is cause
179 suboptimal diff output. It cannot cause incorrect diff output. */
181 static void
182 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
183 struct partition *part, struct context *ctxt)
185 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
186 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
187 #ifdef ELEMENT
188 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
189 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
190 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
191 #else
192 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
193 #endif
194 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
195 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
196 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
197 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
198 OFFSET fmin = fmid;
199 OFFSET fmax = fmid; /* Limits of top-down search. */
200 OFFSET bmin = bmid;
201 OFFSET bmax = bmid; /* Limits of bottom-up search. */
202 OFFSET c; /* Cost. */
203 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
204 diagonal with respect to the northwest. */
206 fd[fmid] = xoff;
207 bd[bmid] = xlim;
209 for (c = 1;; ++c)
211 OFFSET d; /* Active diagonal. */
212 bool big_snake = false;
214 /* Extend the top-down search by an edit step in each diagonal. */
215 if (fmin > dmin)
216 fd[--fmin - 1] = -1;
217 else
218 ++fmin;
219 if (fmax < dmax)
220 fd[++fmax + 1] = -1;
221 else
222 --fmax;
223 for (d = fmax; d >= fmin; d -= 2)
225 OFFSET x;
226 OFFSET y;
227 OFFSET tlo = fd[d - 1];
228 OFFSET thi = fd[d + 1];
229 OFFSET x0 = tlo < thi ? thi : tlo + 1;
231 for (x = x0, y = x0 - d;
232 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
233 x++, y++)
234 continue;
235 if (x - x0 > SNAKE_LIMIT)
236 big_snake = true;
237 fd[d] = x;
238 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
240 part->xmid = x;
241 part->ymid = y;
242 part->lo_minimal = part->hi_minimal = true;
243 return;
247 /* Similarly extend the bottom-up search. */
248 if (bmin > dmin)
249 bd[--bmin - 1] = OFFSET_MAX;
250 else
251 ++bmin;
252 if (bmax < dmax)
253 bd[++bmax + 1] = OFFSET_MAX;
254 else
255 --bmax;
256 for (d = bmax; d >= bmin; d -= 2)
258 OFFSET x;
259 OFFSET y;
260 OFFSET tlo = bd[d - 1];
261 OFFSET thi = bd[d + 1];
262 OFFSET x0 = tlo < thi ? tlo : thi - 1;
264 for (x = x0, y = x0 - d;
265 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
266 x--, y--)
267 continue;
268 if (x0 - x > SNAKE_LIMIT)
269 big_snake = true;
270 bd[d] = x;
271 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
273 part->xmid = x;
274 part->ymid = y;
275 part->lo_minimal = part->hi_minimal = true;
276 return;
280 if (find_minimal)
281 continue;
283 #ifdef USE_HEURISTIC
284 bool heuristic = ctxt->heuristic;
285 #else
286 bool heuristic = false;
287 #endif
289 /* Heuristic: check occasionally for a diagonal that has made lots
290 of progress compared with the edit distance. If we have any
291 such, find the one that has made the most progress and return it
292 as if it had succeeded.
294 With this heuristic, for vectors with a constant small density
295 of changes, the algorithm is linear in the vector size. */
297 if (200 < c && big_snake && heuristic)
300 OFFSET best = 0;
302 for (d = fmax; d >= fmin; d -= 2)
304 OFFSET dd = d - fmid;
305 OFFSET x = fd[d];
306 OFFSET y = x - d;
307 OFFSET v = (x - xoff) * 2 - dd;
309 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
311 if (v > best
312 && xoff + SNAKE_LIMIT <= x && x < xlim
313 && yoff + SNAKE_LIMIT <= y && y < ylim)
315 /* We have a good enough best diagonal; now insist
316 that it end with a significant snake. */
317 int k;
319 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
320 if (k == SNAKE_LIMIT)
322 best = v;
323 part->xmid = x;
324 part->ymid = y;
325 break;
330 if (best > 0)
332 part->lo_minimal = true;
333 part->hi_minimal = false;
334 return;
339 OFFSET best = 0;
341 for (d = bmax; d >= bmin; d -= 2)
343 OFFSET dd = d - bmid;
344 OFFSET x = bd[d];
345 OFFSET y = x - d;
346 OFFSET v = (xlim - x) * 2 + dd;
348 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
350 if (v > best
351 && xoff < x && x <= xlim - SNAKE_LIMIT
352 && yoff < y && y <= ylim - SNAKE_LIMIT)
354 /* We have a good enough best diagonal; now insist
355 that it end with a significant snake. */
356 int k;
358 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
359 if (k == SNAKE_LIMIT - 1)
361 best = v;
362 part->xmid = x;
363 part->ymid = y;
364 break;
369 if (best > 0)
371 part->lo_minimal = false;
372 part->hi_minimal = true;
373 return;
378 /* Heuristic: if we've gone well beyond the call of duty, give up
379 and report halfway between our best results so far. */
380 if (c >= ctxt->too_expensive)
382 /* Find forward diagonal that maximizes X + Y. */
383 OFFSET fxybest = -1, fxbest;
384 for (d = fmax; d >= fmin; d -= 2)
386 OFFSET x = MIN (fd[d], xlim);
387 OFFSET y = x - d;
388 if (ylim < y)
390 x = ylim + d;
391 y = ylim;
393 if (fxybest < x + y)
395 fxybest = x + y;
396 fxbest = x;
400 /* Find backward diagonal that minimizes X + Y. */
401 OFFSET bxybest = OFFSET_MAX, bxbest;
402 for (d = bmax; d >= bmin; d -= 2)
404 OFFSET x = MAX (xoff, bd[d]);
405 OFFSET y = x - d;
406 if (y < yoff)
408 x = yoff + d;
409 y = yoff;
411 if (x + y < bxybest)
413 bxybest = x + y;
414 bxbest = x;
418 /* Use the better of the two diagonals. */
419 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
421 part->xmid = fxbest;
422 part->ymid = fxybest - fxbest;
423 part->lo_minimal = true;
424 part->hi_minimal = false;
426 else
428 part->xmid = bxbest;
429 part->ymid = bxybest - bxbest;
430 part->lo_minimal = false;
431 part->hi_minimal = true;
433 return;
436 #undef XREF_YREF_EQUAL
440 /* Compare in detail contiguous subsequences of the two vectors
441 which are known, as a whole, to match each other.
443 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
445 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
446 are origin-0.
448 If FIND_MINIMAL, find a minimal difference no matter how
449 expensive it is.
451 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
453 Return false if terminated normally, or true if terminated through early
454 abort. */
456 static bool
457 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
458 bool find_minimal, struct context *ctxt)
460 #ifdef ELEMENT
461 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
462 ELEMENT const *yv = ctxt->yvec;
463 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
464 #else
465 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
466 #endif
468 while (true)
470 /* Slide down the bottom initial diagonal. */
471 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
473 xoff++;
474 yoff++;
477 /* Slide up the top initial diagonal. */
478 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
480 xlim--;
481 ylim--;
484 /* Handle simple cases. */
485 if (xoff == xlim)
487 while (yoff < ylim)
489 NOTE_INSERT (ctxt, yoff);
490 if (EARLY_ABORT (ctxt))
491 return true;
492 yoff++;
494 break;
496 if (yoff == ylim)
498 while (xoff < xlim)
500 NOTE_DELETE (ctxt, xoff);
501 if (EARLY_ABORT (ctxt))
502 return true;
503 xoff++;
505 break;
508 struct partition part;
510 /* Find a point of correspondence in the middle of the vectors. */
511 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
513 /* Use the partitions to split this problem into subproblems. */
514 OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
515 bool find_minimal1, find_minimal2;
516 if (!NOTE_ORDERED
517 && ((xlim + ylim) - (part.xmid + part.ymid)
518 < (part.xmid + part.ymid) - (xoff + yoff)))
520 /* The second problem is smaller and the caller doesn't
521 care about order, so do the second problem first to
522 lessen recursion. */
523 xoff1 = part.xmid; xlim1 = xlim;
524 yoff1 = part.ymid; ylim1 = ylim;
525 find_minimal1 = part.hi_minimal;
527 xoff2 = xoff; xlim2 = part.xmid;
528 yoff2 = yoff; ylim2 = part.ymid;
529 find_minimal2 = part.lo_minimal;
531 else
533 xoff1 = xoff; xlim1 = part.xmid;
534 yoff1 = yoff; ylim1 = part.ymid;
535 find_minimal1 = part.lo_minimal;
537 xoff2 = part.xmid; xlim2 = xlim;
538 yoff2 = part.ymid; ylim2 = ylim;
539 find_minimal2 = part.hi_minimal;
542 /* Recurse to do one subproblem. */
543 bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
544 if (early)
545 return early;
547 /* Iterate to do the other subproblem. */
548 xoff = xoff2; xlim = xlim2;
549 yoff = yoff2; ylim = ylim2;
550 find_minimal = find_minimal2;
553 return false;
554 #undef XREF_YREF_EQUAL
557 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
558 # pragma GCC diagnostic pop
559 #endif
561 #undef ELEMENT
562 #undef EQUAL
563 #undef OFFSET
564 #undef EXTRA_CONTEXT_FIELDS
565 #undef NOTE_DELETE
566 #undef NOTE_INSERT
567 #undef EARLY_ABORT
568 #undef USE_HEURISTIC
569 #undef XVECREF_YVECREF_EQUAL
570 #undef OFFSET_MAX