[python3] Simplify generated wrapper post-processing
[xapian.git] / xapian-core / api / editdistance.cc
blobfec54421abd0f8d9f290565106fd493e45707aa1
1 /** @file editdistance.cc
2 * @brief Edit distance calculation algorithm.
4 * Based on that described in:
6 * "An extension of Ukkonen's enhanced dynamic programming ASM algorithm"
7 * by Hal Berghel, University of Arkansas
8 * and David Roach, Acxiom Corporation
10 * http://berghel.net/publications/asm/asm.php
12 /* Copyright (C) 2003 Richard Boulton
13 * Copyright (C) 2007,2008,2009,2017 Olly Betts
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
30 #include <config.h>
32 #include "editdistance.h"
34 #include "omassert.h"
36 #include <algorithm>
37 #include <cstdlib>
38 #include <cstring>
40 using namespace std;
42 template<class CHR>
43 struct edist_seq {
44 edist_seq(const CHR * ptr_, int len_) : ptr(ptr_), len(len_) { }
45 const CHR * ptr;
46 int len;
49 template<class CHR>
50 class edist_state {
51 /// Don't allow assignment.
52 void operator=(const edist_state &);
54 /// Don't allow copying.
55 edist_state(const edist_state &);
57 edist_seq<CHR> seq1;
58 edist_seq<CHR> seq2;
60 /* Array of f(k,p) values, where f(k,p) = the largest index i such that
61 * d(i,j) = p and d(i,j) is on diagonal k.
62 * ie: f(k,p) = largest i s.t. d(i,k+i) = p
63 * Where: d(i,j) = edit distance between substrings of length i and j.
65 int * fkp;
66 int fkp_cols;
68 /* Maximum possible edit distance (this is referred to as ZERO_K in
69 * the algorithm description by Berghel and Roach). */
70 int maxdist;
72 int calc_index(int k, int p) const {
73 return (k + maxdist) * fkp_cols + p + 1;
76 public:
78 edist_state(const CHR * ptr1, int len1, const CHR * ptr2, int len2);
80 ~edist_state();
82 int get_f_kp(int k, int p) const {
83 return fkp[calc_index(k, p)];
86 void set_f_kp(int k, int p, int val) {
87 fkp[calc_index(k, p)] = val;
90 bool is_transposed(int pos1, int pos2) const {
91 if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len) return false;
92 return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
93 seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
96 void edist_calc_f_kp(int k, int p);
99 template<class CHR>
100 void edist_state<CHR>::edist_calc_f_kp(int k, int p)
102 int maxlen = get_f_kp(k, p - 1) + 1; /* dist if do substitute */
103 int maxlen2 = get_f_kp(k - 1, p - 1); /* dist if do insert */
104 int maxlen3 = get_f_kp(k + 1, p - 1) + 1; /* dist if delete */
106 if (is_transposed(maxlen, maxlen + k)) {
107 // Transposition.
108 ++maxlen;
111 if (maxlen >= maxlen2) {
112 if (maxlen >= maxlen3) {
113 // Transposition or Substitution.
114 } else {
115 // Deletion.
116 maxlen = maxlen3;
118 } else {
119 if (maxlen2 >= maxlen3) {
120 // Insertion.
121 maxlen = maxlen2;
122 } else {
123 // Deletion.
124 maxlen = maxlen3;
128 /* Check for exact matches, and increase the length until we don't have
129 * one. */
130 while (maxlen < seq1.len &&
131 maxlen + k < seq2.len &&
132 seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
133 ++maxlen;
135 set_f_kp(k, p, maxlen);
138 #define INF 1000000
139 template<class CHR>
140 edist_state<CHR>::edist_state(const CHR * ptr1, int len1,
141 const CHR * ptr2, int len2)
142 : seq1(ptr1, len1), seq2(ptr2, len2), maxdist(len2)
144 Assert(len2 >= len1);
145 /* Each row represents a value of k, from -maxdist to maxdist. */
146 int fkp_rows = maxdist * 2 + 1;
147 /* Each column represents a value of p, from -1 to maxdist. */
148 fkp_cols = maxdist + 2;
149 /* fkp is stored as a rectangular array, row by row. */
150 fkp = new int[fkp_rows * fkp_cols];
152 for (int k = -maxdist; k <= maxdist; ++k) {
153 for (int p = -1; p <= maxdist; ++p) {
154 if (p == abs(k) - 1) {
155 if (k < 0) {
156 set_f_kp(k, p, abs(k) - 1);
157 } else {
158 set_f_kp(k, p, -1);
160 } else if (p < abs(k)) {
161 set_f_kp(k, p, -INF);
167 template<class CHR>
168 edist_state<CHR>::~edist_state() {
169 delete [] fkp;
172 template<class CHR>
173 static int
174 seqcmp_editdist(const CHR * ptr1, int len1, const CHR * ptr2, int len2,
175 int max_distance)
177 int lendiff = len2 - len1;
178 /* Make sure second sequence is longer (or same length). */
179 if (lendiff < 0) {
180 lendiff = -lendiff;
181 swap(ptr1, ptr2);
182 swap(len1, len2);
185 /* Special case for if one or both sequences are empty. */
186 if (len1 == 0) return len2;
188 edist_state<CHR> state(ptr1, len1, ptr2, len2);
190 int p = lendiff; /* This is the minimum possible edit distance. */
191 while (p <= max_distance) {
192 for (int temp_p = 0; temp_p != p; ++temp_p) {
193 int inc = p - temp_p;
194 if (abs(lendiff - inc) <= temp_p) {
195 state.edist_calc_f_kp(lendiff - inc, temp_p);
197 if (abs(lendiff + inc) <= temp_p) {
198 state.edist_calc_f_kp(lendiff + inc, temp_p);
201 state.edist_calc_f_kp(lendiff, p);
203 if (state.get_f_kp(lendiff, p) == len1) break;
204 ++p;
207 return p;
211 edit_distance_unsigned(const unsigned * ptr1, int len1,
212 const unsigned * ptr2, int len2,
213 int max_distance)
215 return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
218 // We sum the character frequency histogram absolute differences to compute a
219 // lower bound on the edit distance. Rather than counting each Unicode code
220 // point uniquely, we use an array with VEC_SIZE elements and tally code points
221 // modulo VEC_SIZE which can only reduce the bound we calculate.
223 // There will be a trade-off between how good the bound is and how large and
224 // array is used (a larger array takes more time to clear and sum over). The
225 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
226 // but that may not reflect real world performance. FIXME: profile and tune.
228 #define VEC_SIZE 64
231 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)
233 int vec[VEC_SIZE];
234 memset(vec, 0, sizeof(vec));
235 vector<unsigned>::const_iterator i;
236 for (i = a.begin(); i != a.end(); ++i) {
237 ++vec[(*i) % VEC_SIZE];
239 for (i = b.begin(); i != b.end(); ++i) {
240 --vec[(*i) % VEC_SIZE];
242 unsigned int total = 0;
243 for (size_t j = 0; j < VEC_SIZE; ++j) {
244 total += abs(vec[j]);
246 // Each insertion or deletion adds at most 1 to total. Each transposition
247 // doesn't change it at all. But each substitution can change it by 2 so
248 // we need to divide it by 2. Rounding up is OK, since the odd change must
249 // be due to an actual edit.
250 return (total + 1) / 2;