1 /* Find near-matches for strings.
2 Copyright (C) 2015-2022 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
22 #include "coretypes.h"
25 #include "spellcheck.h"
28 /* Cost of a case transformation. */
31 /* Cost of another kind of edit. */
34 /* Get the edit distance between the two strings: the minimal
35 number of edits that are needed to change one string into another,
36 where edits can be one-character insertions, removals, or substitutions,
37 or transpositions of two adjacent characters (counting as one "edit").
39 This implementation uses a modified variant of the Wagner-Fischer
40 algorithm for the Damerau-Levenshtein distance; specifically, the
41 "optimal string alignment distance" or "restricted edit distance"
42 variant. This implementation has been further modified to take
46 get_edit_distance (const char *s
, int len_s
,
47 const char *t
, int len_t
)
49 const bool debug
= false;
53 printf ("s: \"%s\" (len_s=%i)\n", s
, len_s
);
54 printf ("t: \"%s\" (len_t=%i)\n", t
, len_t
);
58 return BASE_COST
* len_t
;
60 return BASE_COST
* len_s
;
62 /* We effectively build a matrix where each (i, j) contains the
63 distance between the prefix strings s[0:j] and t[0:i].
64 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65 we simply keep track of the last two rows, v_one_ago and v_two_ago,
66 and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67 allocation and memory accesses in favor of three (len_s + 1)
68 allocations. These could potentially be
69 statically-allocated if we impose a maximum length on the
70 strings of interest. */
71 edit_distance_t
*v_two_ago
= new edit_distance_t
[len_s
+ 1];
72 edit_distance_t
*v_one_ago
= new edit_distance_t
[len_s
+ 1];
73 edit_distance_t
*v_next
= new edit_distance_t
[len_s
+ 1];
75 /* The first row is for the case of an empty target string, which
76 we can reach by deleting every character in the source string. */
77 for (int i
= 0; i
< len_s
+ 1; i
++)
78 v_one_ago
[i
] = i
* BASE_COST
;
80 /* Build successive rows. */
81 for (int i
= 0; i
< len_t
; i
++)
85 printf ("i:%i v_one_ago = ", i
);
86 for (int j
= 0; j
< len_s
+ 1; j
++)
87 printf ("%i ", v_one_ago
[j
]);
91 /* The initial column is for the case of an empty source string; we
92 can reach prefixes of the target string of length i
93 by inserting i characters. */
94 v_next
[0] = (i
+ 1) * BASE_COST
;
96 /* Build the rest of the row by considering neighbors to
97 the north, west and northwest. */
98 for (int j
= 0; j
< len_s
; j
++)
100 edit_distance_t cost
;
104 else if (TOLOWER (s
[j
]) == TOLOWER (t
[i
]))
108 edit_distance_t deletion
= v_next
[j
] + BASE_COST
;
109 edit_distance_t insertion
= v_one_ago
[j
+ 1] + BASE_COST
;
110 edit_distance_t substitution
= v_one_ago
[j
] + cost
;
111 edit_distance_t cheapest
= MIN (deletion
, insertion
);
112 cheapest
= MIN (cheapest
, substitution
);
113 if (i
> 0 && j
> 0 && s
[j
] == t
[i
- 1] && s
[j
- 1] == t
[i
])
115 edit_distance_t transposition
= v_two_ago
[j
- 1] + BASE_COST
;
116 cheapest
= MIN (cheapest
, transposition
);
118 v_next
[j
+ 1] = cheapest
;
121 /* Prepare to move on to next row. */
122 for (int j
= 0; j
< len_s
+ 1; j
++)
124 v_two_ago
[j
] = v_one_ago
[j
];
125 v_one_ago
[j
] = v_next
[j
];
131 printf ("final v_next = ");
132 for (int j
= 0; j
< len_s
+ 1; j
++)
133 printf ("%i ", v_next
[j
]);
137 edit_distance_t result
= v_next
[len_s
];
144 /* Get the edit distance between two nil-terminated strings. */
147 get_edit_distance (const char *s
, const char *t
)
149 return get_edit_distance (s
, strlen (s
), t
, strlen (t
));
152 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153 an autovec of non-NULL strings, determine which element within
154 CANDIDATES has the lowest edit distance to TARGET. If there are
155 multiple elements with the same minimal distance, the first in the
158 If more than half of the letters were misspelled, the suggestion is
159 likely to be meaningless, so return NULL for this case. */
162 find_closest_string (const char *target
,
163 const auto_vec
<const char *> *candidates
)
166 gcc_assert (candidates
);
169 const char *candidate
;
170 best_match
<const char *, const char *> bm (target
);
171 FOR_EACH_VEC_ELT (*candidates
, i
, candidate
)
173 gcc_assert (candidate
);
174 bm
.consider (candidate
);
177 return bm
.get_best_meaningful_candidate ();
180 /* Generate the maximum edit distance for which we consider a suggestion
181 to be meaningful, given a goal of length GOAL_LEN and a candidate of
182 length CANDIDATE_LEN.
184 This is a third of the length of the candidate or of the goal,
185 whichever is bigger. */
188 get_edit_distance_cutoff (size_t goal_len
, size_t candidate_len
)
190 size_t max_length
= MAX (goal_len
, candidate_len
);
191 size_t min_length
= MIN (goal_len
, candidate_len
);
193 gcc_assert (max_length
>= min_length
);
195 /* Special case: don't offer suggestions for a pair of
196 length == 1 strings (or empty strings). */
200 /* If the lengths are close, then round down. */
201 if (max_length
- min_length
<= 1)
202 /* ...but allow an edit distance of at least 1. */
203 return BASE_COST
* MAX (max_length
/ 3, 1);
205 /* Otherwise, round up (thus giving a little extra leeway to some cases
206 involving insertions/deletions). */
207 return BASE_COST
* (max_length
+ 2) / 3;
216 /* Verify that get_edit_distance (A, B) equals the expected value. */
219 test_get_edit_distance_one_way (const char *a
, const char *b
,
220 edit_distance_t expected
)
222 edit_distance_t actual
= get_edit_distance (a
, b
);
223 ASSERT_EQ (actual
, expected
);
227 get_edit_distance (A, B)
229 get_edit_distance (B, A)
230 equal the expected value, to ensure that the function is symmetric. */
233 test_get_edit_distance_both_ways (const char *a
, const char *b
,
234 edit_distance_t expected
)
236 test_get_edit_distance_one_way (a
, b
, expected
);
237 test_get_edit_distance_one_way (b
, a
, expected
);
240 /* Verify get_edit_distance for a variety of pairs of pre-canned
241 inputs, comparing against known-good values. */
244 test_edit_distances ()
246 test_get_edit_distance_both_ways ("", "nonempty",
247 BASE_COST
* strlen ("nonempty"));
248 test_get_edit_distance_both_ways ("saturday", "sunday",
250 test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST
* 2);
251 test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
252 test_get_edit_distance_both_ways
253 ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST
* 40);
254 test_get_edit_distance_both_ways
255 ("the quick brown fox jumps over the lazy dog",
256 "the quick brown dog jumps over the lazy fox",
258 test_get_edit_distance_both_ways
259 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260 "All your base are belong to us",
262 test_get_edit_distance_both_ways ("foo", "FOO", 3);
263 test_get_edit_distance_both_ways ("fee", "deed", BASE_COST
* 2);
264 test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST
* 2);
265 test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST
* 3);
266 test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST
* 3);
267 test_get_edit_distance_both_ways ("time", "nice", BASE_COST
* 2);
268 test_get_edit_distance_both_ways ("bar", "carg", BASE_COST
* 2);
269 test_get_edit_distance_both_ways ("gtk_widget_show_all",
270 "GtkWidgetShowAll", 10);
271 test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST
* 2);
272 test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST
* 3);
273 test_get_edit_distance_both_ways ("ab", "ac", BASE_COST
* 1);
274 test_get_edit_distance_both_ways ("ab", "a", BASE_COST
* 1);
275 test_get_edit_distance_both_ways ("a", "b", BASE_COST
* 1);
276 test_get_edit_distance_both_ways ("nanl", "name", BASE_COST
* 2);
277 test_get_edit_distance_both_ways ("char", "bar", BASE_COST
* 2);
278 test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST
* 5);
279 test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST
* 4);
281 /* Examples where transposition helps. */
282 test_get_edit_distance_both_ways ("ab", "ba", BASE_COST
* 1);
283 test_get_edit_distance_both_ways ("ba", "abc", BASE_COST
* 2);
284 test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST
* 1);
285 test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
286 "bacdefghijklmnopqrstuvwxzy",
288 test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST
* 4);
289 test_get_edit_distance_both_ways ("signed", "singed", BASE_COST
* 1);
292 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
293 spellchecking cutoff in up to GCC 8. */
295 static edit_distance_t
296 get_old_cutoff (size_t goal_len
, size_t candidate_len
)
298 return BASE_COST
* MAX (goal_len
, candidate_len
) / 2;
301 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302 conservative as in older GCC releases.
304 This should ensure that we don't offer additional meaningless
305 suggestions (apart from those for which transposition has helped). */
308 test_get_edit_distance_cutoff ()
310 for (size_t goal_len
= 0; goal_len
< 30; goal_len
++)
311 for (size_t candidate_len
= 0; candidate_len
< 30; candidate_len
++)
312 ASSERT_TRUE (get_edit_distance_cutoff (goal_len
, candidate_len
)
313 <= get_old_cutoff (goal_len
, candidate_len
));
316 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
319 assert_suggested_for (const location
&loc
, const char *candidate
,
322 auto_vec
<const char *> candidates
;
323 candidates
.safe_push (candidate
);
324 ASSERT_EQ_AT (loc
, candidate
, find_closest_string (target
, &candidates
));
327 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
329 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
330 SELFTEST_BEGIN_STMT \
331 assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
334 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
337 assert_not_suggested_for (const location
&loc
, const char *candidate
,
340 auto_vec
<const char *> candidates
;
341 candidates
.safe_push (candidate
);
342 ASSERT_EQ_AT (loc
, NULL
, find_closest_string (target
, &candidates
));
345 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
347 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
348 SELFTEST_BEGIN_STMT \
349 assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
352 /* Verify that we offer varous suggestions that are meaningful,
353 and that we don't offer various other ones that aren't (PR c/82967). */
358 /* Good suggestions. */
360 ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361 // dist == 2, max_length == 5, min_length == 3
363 ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364 // dist == 3, max_length == 7, min_length == 5
366 ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367 // dist == 7, max_length == 16, min_length = 19
369 ASSERT_SUGGESTED_FOR ("ab", "ac");
370 // dist == 1, max_length == min_length = 2
372 ASSERT_SUGGESTED_FOR ("ab", "a");
373 // dist == 1, max_length == 2, min_length = 1
375 /* Bad suggestions. */
377 ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378 // dist == 1, max_length == min_length = 1
380 ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381 // dist == 3, max_length 6, min_length == 4
383 ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384 // dist == 3, max_length == min_length == 8
386 ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387 ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388 // dist == 2, max_length == min_length == 4
390 ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391 ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392 // dist == 2, max_length == 4, min_length == 3
394 ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395 // dist == 5, max_length == min_length == 9
397 ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398 // dist == 4, max_length == min_length == 8
400 ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401 // dist == 9, max_length == 18, min_length == 11
404 /* Verify that find_closest_string is sane. */
407 test_find_closest_string ()
409 auto_vec
<const char *> candidates
;
411 /* Verify that it can handle an empty vec. */
412 ASSERT_EQ (NULL
, find_closest_string ("", &candidates
));
414 /* Verify that it works sanely for non-empty vecs. */
415 candidates
.safe_push ("apple");
416 candidates
.safe_push ("banana");
417 candidates
.safe_push ("cherry");
419 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates
));
420 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates
));
421 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates
));
422 ASSERT_EQ (NULL
, find_closest_string ("not like the others", &candidates
));
424 /* The order of the vec can matter, but it should not matter for these
426 candidates
.truncate (0);
427 candidates
.safe_push ("cherry");
428 candidates
.safe_push ("banana");
429 candidates
.safe_push ("apple");
430 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates
));
431 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates
));
432 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates
));
433 ASSERT_EQ (NULL
, find_closest_string ("not like the others", &candidates
));
435 /* If the goal string somehow makes it into the candidate list, offering
436 it as a suggestion will be nonsensical. Verify that we don't offer such
438 ASSERT_EQ (NULL
, find_closest_string ("banana", &candidates
));
440 /* Example from PR 69968 where transposition helps. */
441 candidates
.truncate (0);
442 candidates
.safe_push("coordx");
443 candidates
.safe_push("coordy");
444 candidates
.safe_push("coordz");
445 candidates
.safe_push("coordx1");
446 candidates
.safe_push("coordy1");
447 candidates
.safe_push("coordz1");
448 ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates
));
450 candidates
.truncate (0);
451 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
452 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
453 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
454 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
458 /* The same as the previous test, but with a different order of
460 candidates
.truncate (0);
461 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
462 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
463 candidates
.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
464 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
468 /* Example from PR 105564 where option name with missing equal
470 candidates
.truncate (0);
471 candidates
.safe_push ("-Wtrivial-auto-var-init");
472 candidates
.safe_push ("-ftrivial-auto-var-init=");
473 ASSERT_STREQ ("-ftrivial-auto-var-init=",
474 find_closest_string ("-ftrivial-auto-var-init",
478 /* Test data for test_metric_conditions. */
480 static const char * const test_data
[] = {
485 "1234567890123456789012345678901234567890123456789012345678901234567890",
491 /* Verify that get_edit_distance appears to be a sane distance function,
492 even though it doesn't satisfy the conditions for being a metric. (This
493 is because the triangle inequality fails to hold: the distance between
494 "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
495 the distance between "abc" and "ca" is 3. Algorithms that calculate the
496 true Levenshtein-Damerau metric are much more expensive.) */
499 test_metric_conditions ()
501 const int num_test_cases
= ARRAY_SIZE (test_data
);
503 for (int i
= 0; i
< num_test_cases
; i
++)
505 for (int j
= 0; j
< num_test_cases
; j
++)
507 edit_distance_t dist_ij
508 = get_edit_distance (test_data
[i
], test_data
[j
]);
510 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
512 ASSERT_EQ (dist_ij
, 0);
514 ASSERT_TRUE (dist_ij
> 0);
516 /* Symmetry: d(i, j) == d(j, i). */
517 edit_distance_t dist_ji
518 = get_edit_distance (test_data
[j
], test_data
[i
]);
519 ASSERT_EQ (dist_ij
, dist_ji
);
524 /* Run all of the selftests within this file. */
527 spellcheck_cc_tests ()
529 test_edit_distances ();
530 test_get_edit_distance_cutoff ();
532 test_find_closest_string ();
533 test_metric_conditions ();
536 } // namespace selftest
538 #endif /* #if CHECKING_P */