1 /* Find near-matches for strings.
2 Copyright (C) 2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
22 #include "coretypes.h"
25 #include "spellcheck.h"
27 /* The Levenshtein distance is an "edit-distance": the minimal
28 number of one-character insertions, removals or substitutions
29 that are needed to change one string into another.
31 This implementation uses the Wagner-Fischer algorithm. */
34 levenshtein_distance (const char *s
, int len_s
,
35 const char *t
, int len_t
)
37 const bool debug
= false;
41 printf ("s: \"%s\" (len_s=%i)\n", s
, len_s
);
42 printf ("t: \"%s\" (len_t=%i)\n", t
, len_t
);
50 /* We effectively build a matrix where each (i, j) contains the
51 Levenshtein distance between the prefix strings s[0:j]
53 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
54 we simply keep track of the last row, v0 and a new row, v1,
55 which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
56 in favor of two (len_s + 1) allocations. These could potentially be
57 statically-allocated if we impose a maximum length on the
58 strings of interest. */
59 edit_distance_t
*v0
= new edit_distance_t
[len_s
+ 1];
60 edit_distance_t
*v1
= new edit_distance_t
[len_s
+ 1];
62 /* The first row is for the case of an empty target string, which
63 we can reach by deleting every character in the source string. */
64 for (int i
= 0; i
< len_s
+ 1; i
++)
67 /* Build successive rows. */
68 for (int i
= 0; i
< len_t
; i
++)
72 printf ("i:%i v0 = ", i
);
73 for (int j
= 0; j
< len_s
+ 1; j
++)
74 printf ("%i ", v0
[j
]);
78 /* The initial column is for the case of an empty source string; we
79 can reach prefixes of the target string of length i
80 by inserting i characters. */
83 /* Build the rest of the row by considering neighbours to
84 the north, west and northwest. */
85 for (int j
= 0; j
< len_s
; j
++)
87 edit_distance_t cost
= (s
[j
] == t
[i
] ? 0 : 1);
88 edit_distance_t deletion
= v1
[j
] + 1;
89 edit_distance_t insertion
= v0
[j
+ 1] + 1;
90 edit_distance_t substitution
= v0
[j
] + cost
;
91 edit_distance_t cheapest
= MIN (deletion
, insertion
);
92 cheapest
= MIN (cheapest
, substitution
);
96 /* Prepare to move on to next row. */
97 for (int j
= 0; j
< len_s
+ 1; j
++)
103 printf ("final v1 = ");
104 for (int j
= 0; j
< len_s
+ 1; j
++)
105 printf ("%i ", v1
[j
]);
109 edit_distance_t result
= v1
[len_s
];
115 /* Calculate Levenshtein distance between two nil-terminated strings. */
118 levenshtein_distance (const char *s
, const char *t
)
120 return levenshtein_distance (s
, strlen (s
), t
, strlen (t
));