Daily bump.
[official-gcc.git] / gcc / spellcheck.c
blobc39c5073a26cdd6ac60912327e92c5448f751ae1
1 /* Find near-matches for strings.
2 Copyright (C) 2015-2021 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
26 #include "selftest.h"
28 /* Cost of a case transformation. */
29 #define CASE_COST 1
31 /* Cost of another kind of edit. */
32 #define BASE_COST 2
34 /* Get the edit distance between the two strings: the minimal
35 number of edits that are needed to change one string into another,
36 where edits can be one-character insertions, removals, or substitutions,
37 or transpositions of two adjacent characters (counting as one "edit").
39 This implementation uses a modified variant of the Wagner-Fischer
40 algorithm for the Damerau-Levenshtein distance; specifically, the
41 "optimal string alignment distance" or "restricted edit distance"
42 variant. This implementation has been further modified to take
43 case into account. */
45 edit_distance_t
46 get_edit_distance (const char *s, int len_s,
47 const char *t, int len_t)
49 const bool debug = false;
51 if (debug)
53 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
54 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
57 if (len_s == 0)
58 return BASE_COST * len_t;
59 if (len_t == 0)
60 return BASE_COST * len_s;
62 /* We effectively build a matrix where each (i, j) contains the
63 distance between the prefix strings s[0:j] and t[0:i].
64 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65 we simply keep track of the last two rows, v_one_ago and v_two_ago,
66 and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67 allocation and memory accesses in favor of three (len_s + 1)
68 allocations. These could potentially be
69 statically-allocated if we impose a maximum length on the
70 strings of interest. */
71 edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
72 edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
73 edit_distance_t *v_next = new edit_distance_t[len_s + 1];
75 /* The first row is for the case of an empty target string, which
76 we can reach by deleting every character in the source string. */
77 for (int i = 0; i < len_s + 1; i++)
78 v_one_ago[i] = i * BASE_COST;
80 /* Build successive rows. */
81 for (int i = 0; i < len_t; i++)
83 if (debug)
85 printf ("i:%i v_one_ago = ", i);
86 for (int j = 0; j < len_s + 1; j++)
87 printf ("%i ", v_one_ago[j]);
88 printf ("\n");
91 /* The initial column is for the case of an empty source string; we
92 can reach prefixes of the target string of length i
93 by inserting i characters. */
94 v_next[0] = (i + 1) * BASE_COST;
96 /* Build the rest of the row by considering neighbors to
97 the north, west and northwest. */
98 for (int j = 0; j < len_s; j++)
100 edit_distance_t cost;
102 if (s[j] == t[i])
103 cost = 0;
104 else if (TOLOWER (s[j]) == TOLOWER (t[i]))
105 cost = CASE_COST;
106 else
107 cost = BASE_COST;
108 edit_distance_t deletion = v_next[j] + BASE_COST;
109 edit_distance_t insertion = v_one_ago[j + 1] + BASE_COST;
110 edit_distance_t substitution = v_one_ago[j] + cost;
111 edit_distance_t cheapest = MIN (deletion, insertion);
112 cheapest = MIN (cheapest, substitution);
113 if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
115 edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
116 cheapest = MIN (cheapest, transposition);
118 v_next[j + 1] = cheapest;
121 /* Prepare to move on to next row. */
122 for (int j = 0; j < len_s + 1; j++)
124 v_two_ago[j] = v_one_ago[j];
125 v_one_ago[j] = v_next[j];
129 if (debug)
131 printf ("final v_next = ");
132 for (int j = 0; j < len_s + 1; j++)
133 printf ("%i ", v_next[j]);
134 printf ("\n");
137 edit_distance_t result = v_next[len_s];
138 delete[] v_two_ago;
139 delete[] v_one_ago;
140 delete[] v_next;
141 return result;
144 /* Get the edit distance between two nil-terminated strings. */
146 edit_distance_t
147 get_edit_distance (const char *s, const char *t)
149 return get_edit_distance (s, strlen (s), t, strlen (t));
152 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153 an autovec of non-NULL strings, determine which element within
154 CANDIDATES has the lowest edit distance to TARGET. If there are
155 multiple elements with the same minimal distance, the first in the
156 vector wins.
158 If more than half of the letters were misspelled, the suggestion is
159 likely to be meaningless, so return NULL for this case. */
161 const char *
162 find_closest_string (const char *target,
163 const auto_vec<const char *> *candidates)
165 gcc_assert (target);
166 gcc_assert (candidates);
168 int i;
169 const char *candidate;
170 best_match<const char *, const char *> bm (target);
171 FOR_EACH_VEC_ELT (*candidates, i, candidate)
173 gcc_assert (candidate);
174 bm.consider (candidate);
177 return bm.get_best_meaningful_candidate ();
180 /* Generate the maximum edit distance for which we consider a suggestion
181 to be meaningful, given a goal of length GOAL_LEN and a candidate of
182 length CANDIDATE_LEN.
184 This is a third of the length of the candidate or of the goal,
185 whichever is bigger. */
187 edit_distance_t
188 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
190 size_t max_length = MAX (goal_len, candidate_len);
191 size_t min_length = MIN (goal_len, candidate_len);
193 gcc_assert (max_length >= min_length);
195 /* Special case: don't offer suggestions for a pair of
196 length == 1 strings (or empty strings). */
197 if (max_length <= 1)
198 return 0;
200 /* If the lengths are close, then round down. */
201 if (max_length - min_length <= 1)
202 /* ...but allow an edit distance of at least 1. */
203 return BASE_COST * MAX (max_length / 3, 1);
205 /* Otherwise, round up (thus giving a little extra leeway to some cases
206 involving insertions/deletions). */
207 return BASE_COST * (max_length + 2) / 3;
210 #if CHECKING_P
212 namespace selftest {
214 /* Selftests. */
216 /* Verify that get_edit_distance (A, B) equals the expected value. */
218 static void
219 test_get_edit_distance_one_way (const char *a, const char *b,
220 edit_distance_t expected)
222 edit_distance_t actual = get_edit_distance (a, b);
223 ASSERT_EQ (actual, expected);
226 /* Verify that both
227 get_edit_distance (A, B)
229 get_edit_distance (B, A)
230 equal the expected value, to ensure that the function is symmetric. */
232 static void
233 test_get_edit_distance_both_ways (const char *a, const char *b,
234 edit_distance_t expected)
236 test_get_edit_distance_one_way (a, b, expected);
237 test_get_edit_distance_one_way (b, a, expected);
240 /* Verify get_edit_distance for a variety of pairs of pre-canned
241 inputs, comparing against known-good values. */
243 static void
244 test_edit_distances ()
246 test_get_edit_distance_both_ways ("", "nonempty",
247 BASE_COST * strlen ("nonempty"));
248 test_get_edit_distance_both_ways ("saturday", "sunday",
249 BASE_COST * 3);
250 test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST * 2);
251 test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
252 test_get_edit_distance_both_ways
253 ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST * 40);
254 test_get_edit_distance_both_ways
255 ("the quick brown fox jumps over the lazy dog",
256 "the quick brown dog jumps over the lazy fox",
257 BASE_COST * 4);
258 test_get_edit_distance_both_ways
259 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260 "All your base are belong to us",
261 BASE_COST * 44);
262 test_get_edit_distance_both_ways ("foo", "FOO", 3);
263 test_get_edit_distance_both_ways ("fee", "deed", BASE_COST * 2);
264 test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST * 2);
265 test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST * 3);
266 test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST * 3);
267 test_get_edit_distance_both_ways ("time", "nice", BASE_COST * 2);
268 test_get_edit_distance_both_ways ("bar", "carg", BASE_COST * 2);
269 test_get_edit_distance_both_ways ("gtk_widget_show_all",
270 "GtkWidgetShowAll", 10);
271 test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST * 2);
272 test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST * 3);
273 test_get_edit_distance_both_ways ("ab", "ac", BASE_COST * 1);
274 test_get_edit_distance_both_ways ("ab", "a", BASE_COST * 1);
275 test_get_edit_distance_both_ways ("a", "b", BASE_COST * 1);
276 test_get_edit_distance_both_ways ("nanl", "name", BASE_COST * 2);
277 test_get_edit_distance_both_ways ("char", "bar", BASE_COST * 2);
278 test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST * 5);
279 test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST * 4);
281 /* Examples where transposition helps. */
282 test_get_edit_distance_both_ways ("ab", "ba", BASE_COST * 1);
283 test_get_edit_distance_both_ways ("ba", "abc", BASE_COST * 2);
284 test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST * 1);
285 test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
286 "bacdefghijklmnopqrstuvwxzy",
287 BASE_COST * 2);
288 test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST * 4);
289 test_get_edit_distance_both_ways ("signed", "singed", BASE_COST * 1);
292 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
293 spellchecking cutoff in up to GCC 8. */
295 static edit_distance_t
296 get_old_cutoff (size_t goal_len, size_t candidate_len)
298 return BASE_COST * MAX (goal_len, candidate_len) / 2;
301 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302 conservative as in older GCC releases.
304 This should ensure that we don't offer additional meaningless
305 suggestions (apart from those for which transposition has helped). */
307 static void
308 test_get_edit_distance_cutoff ()
310 for (size_t goal_len = 0; goal_len < 30; goal_len++)
311 for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
312 ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
313 <= get_old_cutoff (goal_len, candidate_len));
316 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
318 static void
319 assert_suggested_for (const location &loc, const char *candidate,
320 const char *target)
322 auto_vec<const char *> candidates;
323 candidates.safe_push (candidate);
324 ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
327 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
329 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
330 SELFTEST_BEGIN_STMT \
331 assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
332 SELFTEST_END_STMT
334 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
336 static void
337 assert_not_suggested_for (const location &loc, const char *candidate,
338 const char *target)
340 auto_vec<const char *> candidates;
341 candidates.safe_push (candidate);
342 ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
345 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
347 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
348 SELFTEST_BEGIN_STMT \
349 assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
350 SELFTEST_END_STMT
352 /* Verify that we offer varous suggestions that are meaningful,
353 and that we don't offer various other ones that aren't (PR c/82967). */
355 static void
356 test_suggestions ()
358 /* Good suggestions. */
360 ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361 // dist == 2, max_length == 5, min_length == 3
363 ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364 // dist == 3, max_length == 7, min_length == 5
366 ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367 // dist == 7, max_length == 16, min_length = 19
369 ASSERT_SUGGESTED_FOR ("ab", "ac");
370 // dist == 1, max_length == min_length = 2
372 ASSERT_SUGGESTED_FOR ("ab", "a");
373 // dist == 1, max_length == 2, min_length = 1
375 /* Bad suggestions. */
377 ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378 // dist == 1, max_length == min_length = 1
380 ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381 // dist == 3, max_length 6, min_length == 4
383 ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384 // dist == 3, max_length == min_length == 8
386 ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387 ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388 // dist == 2, max_length == min_length == 4
390 ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391 ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392 // dist == 2, max_length == 4, min_length == 3
394 ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395 // dist == 5, max_length == min_length == 9
397 ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398 // dist == 4, max_length == min_length == 8
400 ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401 // dist == 9, max_length == 18, min_length == 11
404 /* Verify that find_closest_string is sane. */
406 static void
407 test_find_closest_string ()
409 auto_vec<const char *> candidates;
411 /* Verify that it can handle an empty vec. */
412 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
414 /* Verify that it works sanely for non-empty vecs. */
415 candidates.safe_push ("apple");
416 candidates.safe_push ("banana");
417 candidates.safe_push ("cherry");
419 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
420 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
421 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
422 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
424 /* The order of the vec can matter, but it should not matter for these
425 inputs. */
426 candidates.truncate (0);
427 candidates.safe_push ("cherry");
428 candidates.safe_push ("banana");
429 candidates.safe_push ("apple");
430 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
431 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
432 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
433 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
435 /* If the goal string somehow makes it into the candidate list, offering
436 it as a suggestion will be nonsensical. Verify that we don't offer such
437 suggestions. */
438 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
440 /* Example from PR 69968 where transposition helps. */
441 candidates.truncate (0);
442 candidates.safe_push("coordx");
443 candidates.safe_push("coordy");
444 candidates.safe_push("coordz");
445 candidates.safe_push("coordx1");
446 candidates.safe_push("coordy1");
447 candidates.safe_push("coordz1");
448 ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
450 candidates.truncate (0);
451 candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
452 candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
453 candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
454 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
456 &candidates));
458 /* The same as the previous test, but with a different order of
459 candidates. */
460 candidates.truncate (0);
461 candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
462 candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
463 candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
464 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
466 &candidates));
469 /* Test data for test_metric_conditions. */
471 static const char * const test_data[] = {
473 "foo",
474 "food",
475 "boo",
476 "1234567890123456789012345678901234567890123456789012345678901234567890",
477 "abc",
478 "ac",
479 "ca",
482 /* Verify that get_edit_distance appears to be a sane distance function,
483 even though it doesn't satisfy the conditions for being a metric. (This
484 is because the triangle inequality fails to hold: the distance between
485 "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
486 the distance between "abc" and "ca" is 3. Algorithms that calculate the
487 true Levenshtein-Damerau metric are much more expensive.) */
489 static void
490 test_metric_conditions ()
492 const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
494 for (int i = 0; i < num_test_cases; i++)
496 for (int j = 0; j < num_test_cases; j++)
498 edit_distance_t dist_ij
499 = get_edit_distance (test_data[i], test_data[j]);
501 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
502 if (i == j)
503 ASSERT_EQ (dist_ij, 0);
504 else
505 ASSERT_TRUE (dist_ij > 0);
507 /* Symmetry: d(i, j) == d(j, i). */
508 edit_distance_t dist_ji
509 = get_edit_distance (test_data[j], test_data[i]);
510 ASSERT_EQ (dist_ij, dist_ji);
515 /* Run all of the selftests within this file. */
517 void
518 spellcheck_c_tests ()
520 test_edit_distances ();
521 test_get_edit_distance_cutoff ();
522 test_suggestions ();
523 test_find_closest_string ();
524 test_metric_conditions ();
527 } // namespace selftest
529 #endif /* #if CHECKING_P */