2 * @brief Edit distance calculation algorithm.
4 * Based on that described in:
6 * "An extension of Ukkonen's enhanced dynamic programming ASM algorithm"
7 * by Hal Berghel, University of Arkansas
8 * and David Roach, Acxiom Corporation
10 * http://berghel.net/publications/asm/asm.php
12 /* Copyright (C) 2003 Richard Boulton
13 * Copyright (C) 2007,2008,2009,2017,2019,2020 Olly Betts
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "editdistance.h"
46 edist_seq(const Char
* ptr_
, int len_
) : ptr(ptr_
), len(len_
) { }
53 /// Don't allow assignment.
54 edist_state
& operator=(const edist_state
&) = delete;
56 /// Don't allow copying.
57 edist_state(const edist_state
&) = delete;
62 /* Array of f(k,p) values, where f(k,p) = the largest index i such that
63 * d(i,j) = p and d(i,j) is on diagonal k.
64 * ie: f(k,p) = largest i s.t. d(i,k+i) = p
65 * Where: d(i,j) = edit distance between substrings of length i and j.
70 /* Maximum possible edit distance (this is referred to as ZERO_K in
71 * the algorithm description by Berghel and Roach). */
74 int calc_index(int k
, int p
) const {
75 return k
+ maxdist
+ fkp_rows
* (p
+ 1);
79 edist_state(const Char
* ptr1
, int len1
, const Char
* ptr2
, int len2
,
81 : seq1(ptr1
, len1
), seq2(ptr2
, len2
), fkp(fkp_
), maxdist(len2
) {
83 // fkp is stored as a rectangular array, column by column. Each entry
84 // represents a value of p, from -1 to maxdist or a special value
85 // close-ish to INT_MIN.
86 fkp_rows
= 2 * maxdist
+ 1;
87 // It's significantly faster to memset() than std::fill_n() with an int
88 // value, so fill with the msb of INT_MIN, which for 32-bit 2's
89 // complement int means -2139062144 instead of -2147483648, which is
90 // fine what we need here.
91 memset(fkp
, unsigned(INT_MIN
) >> (8 * (sizeof(int) - 1)),
92 sizeof(int) * (calc_index(maxdist
, maxdist
- 2) + 1));
94 for (int k
= 1; k
<= maxdist
; ++k
) {
95 set_f_kp(k
, k
- 1, -1);
96 set_f_kp(-k
, k
- 1, k
- 1);
100 int get_f_kp(int k
, int p
) const {
101 return fkp
[calc_index(k
, p
)];
104 void set_f_kp(int k
, int p
, int val
) {
105 fkp
[calc_index(k
, p
)] = val
;
108 bool is_transposed(int pos1
, int pos2
) const {
109 if (pos1
<= 0 || pos2
<= 0 || pos1
>= seq1
.len
|| pos2
>= seq2
.len
)
111 return (seq1
.ptr
[pos1
- 1] == seq2
.ptr
[pos2
] &&
112 seq1
.ptr
[pos1
] == seq2
.ptr
[pos2
- 1]);
115 void edist_calc_f_kp(int k
, int p
);
119 void edist_state
<Char
>::edist_calc_f_kp(int k
, int p
)
121 int maxlen
= get_f_kp(k
, p
- 1) + 1; /* dist if do substitute */
122 int maxlen2
= get_f_kp(k
- 1, p
- 1); /* dist if do insert */
123 int maxlen3
= get_f_kp(k
+ 1, p
- 1) + 1; /* dist if delete */
125 if (is_transposed(maxlen
, maxlen
+ k
)) {
130 if (maxlen
>= maxlen2
) {
131 if (maxlen
>= maxlen3
) {
132 // Transposition or Substitution.
138 if (maxlen2
>= maxlen3
) {
147 /* Check for exact matches, and increase the length until we don't have
149 while (maxlen
< seq1
.len
&&
150 maxlen
+ k
< seq2
.len
&&
151 seq1
.ptr
[maxlen
] == seq2
.ptr
[maxlen
+ k
]) {
154 set_f_kp(k
, p
, maxlen
);
159 seqcmp_editdist(const Char
* ptr1
, int len1
, const Char
* ptr2
, int len2
,
160 int* fkp_
, int max_distance
)
162 int lendiff
= len2
- len1
;
163 /* Make sure second sequence is longer (or same length). */
170 /* Special case for if one or both sequences are empty. */
171 if (len1
== 0) return len2
;
173 edist_state
<Char
> state(ptr1
, len1
, ptr2
, len2
, fkp_
);
175 int p
= lendiff
; /* This is the minimum possible edit distance. */
176 while (p
<= max_distance
) {
177 for (int temp_p
= 0; temp_p
!= p
; ++temp_p
) {
178 int inc
= p
- temp_p
;
179 if (abs(lendiff
- inc
) <= temp_p
) {
180 state
.edist_calc_f_kp(lendiff
- inc
, temp_p
);
182 if (abs(lendiff
+ inc
) <= temp_p
) {
183 state
.edist_calc_f_kp(lendiff
+ inc
, temp_p
);
186 state
.edist_calc_f_kp(lendiff
, p
);
188 if (state
.get_f_kp(lendiff
, p
) == len1
) break;
196 EditDistanceCalculator::calc(const unsigned* ptr
, int len
,
197 int max_distance
) const
199 // Calculate a cheap lower bound on the edit distance by considering
200 // frequency histograms.
201 freqs_bitmap freqs
= 0;
202 for (int i
= 0; i
!= len
; ++i
) {
203 unsigned ch
= ptr
[i
];
204 freqs
|= freqs_bitmap(1) << (ch
& FREQS_MASK
);
206 // Each insertion or deletion adds at most 1 to total. Each transposition
207 // doesn't change it at all. But each substitution can change it by 2 so
208 // we need to divide it by 2. We round up since the unpaired change must
209 // be due to an actual edit.
211 add_popcount(bits
, freqs
^ target_freqs
);
212 int ed_lower_bound
= bits
/ 2;
213 if (ed_lower_bound
> max_distance
) {
214 // It's OK to return any distance > max_distance if the true answer is
216 return ed_lower_bound
;
220 // Allocate space for the largest case we need to consider, which is
221 // when the second sequence is len + max_distance long. Any second
222 // sequence which is longer must be more than max_distance edits
224 int maxdist
= target
.size() + max_distance
;
225 int max_cols
= maxdist
* 2;
226 int max_rows
= maxdist
* 2 + 1;
227 array
= new int[max_rows
* max_cols
];
230 return seqcmp_editdist
<unsigned>(ptr
, len
, &target
[0], target
.size(),
231 array
, max_distance
);