2016-01-26 Paul Thomas <pault@gcc.gnu.org>
[official-gcc.git] / gcc / spellcheck.c
blobe641a56bc023ae37d538172b46115957dc9abb44
1 /* Find near-matches for strings.
2 Copyright (C) 2015-2016 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
27 /* The Levenshtein distance is an "edit-distance": the minimal
28 number of one-character insertions, removals or substitutions
29 that are needed to change one string into another.
31 This implementation uses the Wagner-Fischer algorithm. */
33 edit_distance_t
34 levenshtein_distance (const char *s, int len_s,
35 const char *t, int len_t)
37 const bool debug = false;
39 if (debug)
41 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
42 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
45 if (len_s == 0)
46 return len_t;
47 if (len_t == 0)
48 return len_s;
50 /* We effectively build a matrix where each (i, j) contains the
51 Levenshtein distance between the prefix strings s[0:j]
52 and t[0:i].
53 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
54 we simply keep track of the last row, v0 and a new row, v1,
55 which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
56 in favor of two (len_s + 1) allocations. These could potentially be
57 statically-allocated if we impose a maximum length on the
58 strings of interest. */
59 edit_distance_t *v0 = new edit_distance_t[len_s + 1];
60 edit_distance_t *v1 = new edit_distance_t[len_s + 1];
62 /* The first row is for the case of an empty target string, which
63 we can reach by deleting every character in the source string. */
64 for (int i = 0; i < len_s + 1; i++)
65 v0[i] = i;
67 /* Build successive rows. */
68 for (int i = 0; i < len_t; i++)
70 if (debug)
72 printf ("i:%i v0 = ", i);
73 for (int j = 0; j < len_s + 1; j++)
74 printf ("%i ", v0[j]);
75 printf ("\n");
78 /* The initial column is for the case of an empty source string; we
79 can reach prefixes of the target string of length i
80 by inserting i characters. */
81 v1[0] = i + 1;
83 /* Build the rest of the row by considering neighbours to
84 the north, west and northwest. */
85 for (int j = 0; j < len_s; j++)
87 edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
88 edit_distance_t deletion = v1[j] + 1;
89 edit_distance_t insertion = v0[j + 1] + 1;
90 edit_distance_t substitution = v0[j] + cost;
91 edit_distance_t cheapest = MIN (deletion, insertion);
92 cheapest = MIN (cheapest, substitution);
93 v1[j + 1] = cheapest;
96 /* Prepare to move on to next row. */
97 for (int j = 0; j < len_s + 1; j++)
98 v0[j] = v1[j];
101 if (debug)
103 printf ("final v1 = ");
104 for (int j = 0; j < len_s + 1; j++)
105 printf ("%i ", v1[j]);
106 printf ("\n");
109 edit_distance_t result = v1[len_s];
110 delete[] v0;
111 delete[] v1;
112 return result;
115 /* Calculate Levenshtein distance between two nil-terminated strings. */
117 edit_distance_t
118 levenshtein_distance (const char *s, const char *t)
120 return levenshtein_distance (s, strlen (s), t, strlen (t));