1 /** @file editdistance.cc
2 * @brief Edit distance calculation algorithm.
4 * Based on that described in:
6 * "An extension of Ukkonen's enhanced dynamic programming ASM algorithm"
7 * by Hal Berghel, University of Arkansas
8 * and David Roach, Acxiom Corporation
10 * http://berghel.net/publications/asm/asm.php
12 /* Copyright (C) 2003 Richard Boulton
13 * Copyright (C) 2007,2008,2009,2017 Olly Betts
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "editdistance.h"
44 edist_seq(const CHR
* ptr_
, int len_
) : ptr(ptr_
), len(len_
) { }
51 /// Don't allow assignment.
52 void operator=(const edist_state
&);
54 /// Don't allow copying.
55 edist_state(const edist_state
&);
60 /* Array of f(k,p) values, where f(k,p) = the largest index i such that
61 * d(i,j) = p and d(i,j) is on diagonal k.
62 * ie: f(k,p) = largest i s.t. d(i,k+i) = p
63 * Where: d(i,j) = edit distance between substrings of length i and j.
68 /* Maximum possible edit distance (this is referred to as ZERO_K in
69 * the algorithm description by Berghel and Roach). */
72 int calc_index(int k
, int p
) const {
73 return (k
+ maxdist
) * fkp_cols
+ p
+ 1;
78 edist_state(const CHR
* ptr1
, int len1
, const CHR
* ptr2
, int len2
);
82 int get_f_kp(int k
, int p
) const {
83 return fkp
[calc_index(k
, p
)];
86 void set_f_kp(int k
, int p
, int val
) {
87 fkp
[calc_index(k
, p
)] = val
;
90 bool is_transposed(int pos1
, int pos2
) const {
91 if (pos1
<= 0 || pos2
<= 0 || pos1
>= seq1
.len
|| pos2
>= seq2
.len
) return false;
92 return (seq1
.ptr
[pos1
- 1] == seq2
.ptr
[pos2
] &&
93 seq1
.ptr
[pos1
] == seq2
.ptr
[pos2
- 1]);
96 void edist_calc_f_kp(int k
, int p
);
100 void edist_state
<CHR
>::edist_calc_f_kp(int k
, int p
)
102 int maxlen
= get_f_kp(k
, p
- 1) + 1; /* dist if do substitute */
103 int maxlen2
= get_f_kp(k
- 1, p
- 1); /* dist if do insert */
104 int maxlen3
= get_f_kp(k
+ 1, p
- 1) + 1; /* dist if delete */
106 if (is_transposed(maxlen
, maxlen
+ k
)) {
111 if (maxlen
>= maxlen2
) {
112 if (maxlen
>= maxlen3
) {
113 // Transposition or Substitution.
119 if (maxlen2
>= maxlen3
) {
128 /* Check for exact matches, and increase the length until we don't have
130 while (maxlen
< seq1
.len
&&
131 maxlen
+ k
< seq2
.len
&&
132 seq1
.ptr
[maxlen
] == seq2
.ptr
[maxlen
+ k
]) {
135 set_f_kp(k
, p
, maxlen
);
140 edist_state
<CHR
>::edist_state(const CHR
* ptr1
, int len1
,
141 const CHR
* ptr2
, int len2
)
142 : seq1(ptr1
, len1
), seq2(ptr2
, len2
), maxdist(len2
)
144 Assert(len2
>= len1
);
145 /* Each row represents a value of k, from -maxdist to maxdist. */
146 int fkp_rows
= maxdist
* 2 + 1;
147 /* Each column represents a value of p, from -1 to maxdist. */
148 fkp_cols
= maxdist
+ 2;
149 /* fkp is stored as a rectangular array, row by row. */
150 fkp
= new int[fkp_rows
* fkp_cols
];
152 for (int k
= -maxdist
; k
<= maxdist
; ++k
) {
153 for (int p
= -1; p
<= maxdist
; ++p
) {
154 if (p
== abs(k
) - 1) {
156 set_f_kp(k
, p
, abs(k
) - 1);
160 } else if (p
< abs(k
)) {
161 set_f_kp(k
, p
, -INF
);
168 edist_state
<CHR
>::~edist_state() {
174 seqcmp_editdist(const CHR
* ptr1
, int len1
, const CHR
* ptr2
, int len2
,
177 int lendiff
= len2
- len1
;
178 /* Make sure second sequence is longer (or same length). */
185 /* Special case for if one or both sequences are empty. */
186 if (len1
== 0) return len2
;
188 edist_state
<CHR
> state(ptr1
, len1
, ptr2
, len2
);
190 int p
= lendiff
; /* This is the minimum possible edit distance. */
191 while (p
<= max_distance
) {
192 for (int temp_p
= 0; temp_p
!= p
; ++temp_p
) {
193 int inc
= p
- temp_p
;
194 if (abs(lendiff
- inc
) <= temp_p
) {
195 state
.edist_calc_f_kp(lendiff
- inc
, temp_p
);
197 if (abs(lendiff
+ inc
) <= temp_p
) {
198 state
.edist_calc_f_kp(lendiff
+ inc
, temp_p
);
201 state
.edist_calc_f_kp(lendiff
, p
);
203 if (state
.get_f_kp(lendiff
, p
) == len1
) break;
211 edit_distance_unsigned(const unsigned * ptr1
, int len1
,
212 const unsigned * ptr2
, int len2
,
215 return seqcmp_editdist
<unsigned>(ptr1
, len1
, ptr2
, len2
, max_distance
);
218 // We sum the character frequency histogram absolute differences to compute a
219 // lower bound on the edit distance. Rather than counting each Unicode code
220 // point uniquely, we use an array with VEC_SIZE elements and tally code points
221 // modulo VEC_SIZE which can only reduce the bound we calculate.
223 // There will be a trade-off between how good the bound is and how large and
224 // array is used (a larger array takes more time to clear and sum over). The
225 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
226 // but that may not reflect real world performance. FIXME: profile and tune.
231 freq_edit_lower_bound(const vector
<unsigned> & a
, const vector
<unsigned> & b
)
234 memset(vec
, 0, sizeof(vec
));
235 vector
<unsigned>::const_iterator i
;
236 for (i
= a
.begin(); i
!= a
.end(); ++i
) {
237 ++vec
[(*i
) % VEC_SIZE
];
239 for (i
= b
.begin(); i
!= b
.end(); ++i
) {
240 --vec
[(*i
) % VEC_SIZE
];
242 unsigned int total
= 0;
243 for (size_t j
= 0; j
< VEC_SIZE
; ++j
) {
244 total
+= abs(vec
[j
]);
246 // Each insertion or deletion adds at most 1 to total. Each transposition
247 // doesn't change it at all. But each substitution can change it by 2 so
248 // we need to divide it by 2. Rounding up is OK, since the odd change must
249 // be due to an actual edit.
250 return (total
+ 1) / 2;