[ci] Update macos jobs
[xapian.git] / xapian-core / api / editdistance.cc
blob3f71e3f6befaba09d3868bc2a0ae45792c2188d4
1 /** @file
2 * @brief Edit distance calculation algorithm.
4 * Based on that described in:
6 * "An extension of Ukkonen's enhanced dynamic programming ASM algorithm"
7 * by Hal Berghel, University of Arkansas
8 * and David Roach, Acxiom Corporation
10 * http://berghel.net/publications/asm/asm.php
12 /* Copyright (C) 2003 Richard Boulton
13 * Copyright (C) 2007,2008,2009,2017,2019,2020 Olly Betts
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
30 #include <config.h>
32 #include "editdistance.h"
34 #include "omassert.h"
35 #include "popcount.h"
37 #include <algorithm>
38 #include <climits>
39 #include <cstdlib>
40 #include <cstring>
42 using namespace std;
44 template<class Char>
45 struct edist_seq {
46 edist_seq(const Char* ptr_, int len_) : ptr(ptr_), len(len_) { }
47 const Char* ptr;
48 int len;
51 template<class Char>
52 class edist_state {
53 /// Don't allow assignment.
54 edist_state& operator=(const edist_state&) = delete;
56 /// Don't allow copying.
57 edist_state(const edist_state&) = delete;
59 edist_seq<Char> seq1;
60 edist_seq<Char> seq2;
62 /* Array of f(k,p) values, where f(k,p) = the largest index i such that
63 * d(i,j) = p and d(i,j) is on diagonal k.
64 * ie: f(k,p) = largest i s.t. d(i,k+i) = p
65 * Where: d(i,j) = edit distance between substrings of length i and j.
67 int* fkp;
68 int fkp_rows;
70 /* Maximum possible edit distance (this is referred to as ZERO_K in
71 * the algorithm description by Berghel and Roach). */
72 int maxdist;
74 int calc_index(int k, int p) const {
75 return k + maxdist + fkp_rows * (p + 1);
78 public:
79 edist_state(const Char* ptr1, int len1, const Char* ptr2, int len2,
80 int* fkp_)
81 : seq1(ptr1, len1), seq2(ptr2, len2), fkp(fkp_), maxdist(len2) {
82 Assert(len2 >= len1);
83 // fkp is stored as a rectangular array, column by column. Each entry
84 // represents a value of p, from -1 to maxdist or a special value
85 // close-ish to INT_MIN.
86 fkp_rows = 2 * maxdist + 1;
87 // It's significantly faster to memset() than std::fill_n() with an int
88 // value, so fill with the msb of INT_MIN, which for 32-bit 2's
89 // complement int means -2139062144 instead of -2147483648, which is
90 // fine what we need here.
91 memset(fkp, unsigned(INT_MIN) >> (8 * (sizeof(int) - 1)),
92 sizeof(int) * (calc_index(maxdist, maxdist - 2) + 1));
93 set_f_kp(0, -1, -1);
94 for (int k = 1; k <= maxdist; ++k) {
95 set_f_kp(k, k - 1, -1);
96 set_f_kp(-k, k - 1, k - 1);
100 int get_f_kp(int k, int p) const {
101 return fkp[calc_index(k, p)];
104 void set_f_kp(int k, int p, int val) {
105 fkp[calc_index(k, p)] = val;
108 bool is_transposed(int pos1, int pos2) const {
109 if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len)
110 return false;
111 return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
112 seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
115 void edist_calc_f_kp(int k, int p);
118 template<class Char>
119 void edist_state<Char>::edist_calc_f_kp(int k, int p)
121 int maxlen = get_f_kp(k, p - 1) + 1; /* dist if do substitute */
122 int maxlen2 = get_f_kp(k - 1, p - 1); /* dist if do insert */
123 int maxlen3 = get_f_kp(k + 1, p - 1) + 1; /* dist if delete */
125 if (is_transposed(maxlen, maxlen + k)) {
126 // Transposition.
127 ++maxlen;
130 if (maxlen >= maxlen2) {
131 if (maxlen >= maxlen3) {
132 // Transposition or Substitution.
133 } else {
134 // Deletion.
135 maxlen = maxlen3;
137 } else {
138 if (maxlen2 >= maxlen3) {
139 // Insertion.
140 maxlen = maxlen2;
141 } else {
142 // Deletion.
143 maxlen = maxlen3;
147 /* Check for exact matches, and increase the length until we don't have
148 * one. */
149 while (maxlen < seq1.len &&
150 maxlen + k < seq2.len &&
151 seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
152 ++maxlen;
154 set_f_kp(k, p, maxlen);
157 template<class Char>
158 static int
159 seqcmp_editdist(const Char* ptr1, int len1, const Char* ptr2, int len2,
160 int* fkp_, int max_distance)
162 int lendiff = len2 - len1;
163 /* Make sure second sequence is longer (or same length). */
164 if (lendiff < 0) {
165 lendiff = -lendiff;
166 swap(ptr1, ptr2);
167 swap(len1, len2);
170 /* Special case for if one or both sequences are empty. */
171 if (len1 == 0) return len2;
173 edist_state<Char> state(ptr1, len1, ptr2, len2, fkp_);
175 int p = lendiff; /* This is the minimum possible edit distance. */
176 while (p <= max_distance) {
177 for (int temp_p = 0; temp_p != p; ++temp_p) {
178 int inc = p - temp_p;
179 if (abs(lendiff - inc) <= temp_p) {
180 state.edist_calc_f_kp(lendiff - inc, temp_p);
182 if (abs(lendiff + inc) <= temp_p) {
183 state.edist_calc_f_kp(lendiff + inc, temp_p);
186 state.edist_calc_f_kp(lendiff, p);
188 if (state.get_f_kp(lendiff, p) == len1) break;
189 ++p;
192 return p;
196 EditDistanceCalculator::calc(const unsigned* ptr, int len,
197 int max_distance) const
199 // Calculate a cheap lower bound on the edit distance by considering
200 // frequency histograms.
201 freqs_bitmap freqs = 0;
202 for (int i = 0; i != len; ++i) {
203 unsigned ch = ptr[i];
204 freqs |= freqs_bitmap(1) << (ch & FREQS_MASK);
206 // Each insertion or deletion adds at most 1 to total. Each transposition
207 // doesn't change it at all. But each substitution can change it by 2 so
208 // we need to divide it by 2. We round up since the unpaired change must
209 // be due to an actual edit.
210 unsigned bits = 1;
211 add_popcount(bits, freqs ^ target_freqs);
212 int ed_lower_bound = bits / 2;
213 if (ed_lower_bound > max_distance) {
214 // It's OK to return any distance > max_distance if the true answer is
215 // > max_distance.
216 return ed_lower_bound;
219 if (!array) {
220 // Allocate space for the largest case we need to consider, which is
221 // when the second sequence is len + max_distance long. Any second
222 // sequence which is longer must be more than max_distance edits
223 // away.
224 int maxdist = target.size() + max_distance;
225 int max_cols = maxdist * 2;
226 int max_rows = maxdist * 2 + 1;
227 array = new int[max_rows * max_cols];
230 return seqcmp_editdist<unsigned>(ptr, len, &target[0], target.size(),
231 array, max_distance);