Moving more modules
[apertium.git] / trunk / apertium-tools / apertium-utils / cognate-indux / similarity.cpp
blob969e12a8eba0e8edc5fb90743b7c312d5d8cf355
1 /*
2 * Copyright (C) 2007 Francis Tyers
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #include "similarity.h"
22 using namespace std;
25 * broadly speaking:
26 * Levenshtein Distance Algorithm: C++ Implementation
27 * by Anders Sewerin Johansen
31 int levenshtein_distance(wstring string1, wstring string2)
33 int n = string1.length();
34 int m = string2.length();
36 if(n == 0 || m == 0) {
37 return -1;
40 vector< vector<int> > matrix(n + 1);
42 for(int i = 0; i <= n; i++) {
43 matrix[i].resize(m + 1);
46 for(int i = 0; i <= n; i++) {
47 matrix[i][0] = i;
50 for(int j = 0; j <= m; j++) {
51 matrix[0][j] = j;
54 for (int i = 1; i <= n; i++) {
55 const wchar_t s1i = string1[i - 1];
56 for (int j = 1; j <= m; j++) {
57 const wchar_t s2j = string2[j - 1];
58 int cost;
59 if (s1i == s2j) {
60 cost = 0;
61 } else {
62 cost = 1;
64 const int above = matrix[i - 1][j];
65 const int left = matrix[i][j - 1];
66 const int diag = matrix[i - 1][j - 1];
67 const int cell = min(above + 1, min(left + 1, diag + cost));
69 matrix[i][j] = cell;
73 return matrix[n][m];
77 * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
79 * as described in...
82 float dice_coefficient(wstring string1, wstring string2)
84 int overlap = 0;
86 vector<wstring> string1_bigrams;
87 vector<wstring> string2_bigrams;
89 vector<wstring>::iterator iter;
91 iter = string1_bigrams.begin();
92 for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1
93 iter = string1_bigrams.insert(iter, string1.substr(i, 2));
96 iter = string2_bigrams.begin();
97 for(unsigned int j = 0; j < (string2.length() - 1); j++) { // extract character bigrams from string2
98 iter = string2_bigrams.insert(iter, string2.substr(j, 2));
101 // calculate character bigram overlap between two strings
102 for(unsigned int z = 0; z < string1_bigrams.size(); z++) {
103 for(unsigned int x = 0; x < string2_bigrams.size(); x++) {
104 if(string1_bigrams.at(z) == string2_bigrams.at(x)) {
105 overlap++;
110 // calculate dice coefficient
111 int total = string1_bigrams.size() + string2_bigrams.size();
112 float dice = (float)(overlap * 2) / (float)total;
114 return dice;
118 * Same as dice_coefficient only with gappy bigrams.
121 float xdice_coefficient(wstring string1, wstring string2)
123 int overlap = 0;
125 vector<wstring> string1_bigrams;
126 vector<wstring> string2_bigrams;
128 vector<wstring>::iterator iter;
130 iter = string1_bigrams.begin();
131 for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1
132 iter = string1_bigrams.insert(iter, string1.substr(i, 2));
135 for(unsigned int h = 0; h < (string1.length() - 1); h++) { // extract trigrams -- for gappy matching.
136 iter = string1_bigrams.insert(iter, string1.substr(h, 3));
139 iter = string2_bigrams.begin();
140 for(unsigned int j = 0; j < (string2.length() - 1); j++) { // extract character bigrams from string2
141 iter = string2_bigrams.insert(iter, string2.substr(j, 2));
144 for(unsigned int l = 0; l < (string2.length() - 1); l++) { // extract trigrams
145 iter = string2_bigrams.insert(iter, string2.substr(l, 3));
148 // calculate character bigram overlap between two strings
149 for(unsigned int z = 0; z < string1_bigrams.size(); z++) {
150 for(unsigned int x = 0; x < string2_bigrams.size(); x++) {
151 if(string1_bigrams.at(z) == string2_bigrams.at(x)) {
152 overlap++; // bar:bar = 1
153 } else if(string1_bigrams.at(z)[0] == string2_bigrams.at(x)[0] && string1_bigrams.at(z)[2] == string2_bigrams.at(x)[2]) {
154 overlap++; // bar:bur = 1
155 } else if(string1_bigrams.at(z)[0] == string2_bigrams.at(x)[0] && string1_bigrams.at(z)[2] == string2_bigrams.at(x)[1]) {
156 overlap++; // bar:br = 1
161 // calculate dice coefficient
162 int total = string1_bigrams.size() + string2_bigrams.size();
163 float dice = (float)(overlap * 2) / (float)total;
165 return dice;
169 * xxdice, as xdice but includes adjustment based on
170 * bigram position offset.
173 float xxdice_coefficient(wstring string1, wstring string2)
175 float overlap = 0.0;
177 vector<wstring> string1_bigrams;
178 vector<wstring> string2_bigrams;
180 vector<wstring>::iterator iter;
182 iter = string1_bigrams.begin();
183 for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1
184 iter = string1_bigrams.insert(iter, string1.substr(i, 2));
187 for(unsigned int h = 0; h < (string1.length() - 1); h++) { // extract trigrams -- for gappy matching.
188 iter = string1_bigrams.insert(iter, string1.substr(h, 3));
191 iter = string2_bigrams.begin();
192 for(unsigned int j = 0; j < (string2.length() - 1); j++) { // extract character bigrams from string2
193 iter = string2_bigrams.insert(iter, string2.substr(j, 2));
196 for(unsigned int l = 0; l < (string2.length() - 1); l++) { // extract trigrams
197 iter = string2_bigrams.insert(iter, string2.substr(l, 3));
200 // calculate character bigram overlap between two strings
201 for(unsigned int z = 0; z < string1_bigrams.size(); z++) {
202 for(unsigned int x = 0; x < string2_bigrams.size(); x++) {
203 int idx1 = string1.find_first_of(string1_bigrams.at(z));
204 int idx2 = string2.find_first_of(string2_bigrams.at(x));
205 int offset = idx1 - idx2; // offset is the difference between the positions of the bigrams
206 float fiddle = (float)(2 / (1 + (offset * offset))); // calculate fiddle factor
207 if(string1_bigrams.at(z) == string2_bigrams.at(x)) {
208 overlap += fiddle; // bar:bar = 1
209 } else if(string1_bigrams.at(z)[0] == string2_bigrams.at(x)[0] && string1_bigrams.at(z)[2] == string2_bigrams.at(x)[2]) {
210 overlap += fiddle; // bar:bur = 1
211 } else if(string1_bigrams.at(z)[0] == string2_bigrams.at(x)[0] && string1_bigrams.at(z)[2] == string2_bigrams.at(x)[1]) {
212 overlap += fiddle; // bar:br = 1
217 // calculate dice coefficient
218 int total = string1_bigrams.size() + string2_bigrams.size();
219 float dice = (float)(overlap * 2) / (float)total;
221 return dice;
225 * This is a hack of a hack of a hack.
226 * Kind of works though.
229 int longest_common_subsequence(wstring string1, wstring string2)
231 unsigned int i, j;
233 unsigned int n = string1.length();
234 unsigned int m = string2.length();
236 if(n == 0 || m == 0) {
237 return -1;
240 vector< vector<int> > matrix(n + 2);
242 for(i = 0; i <= n+1; i++) {
243 matrix[i].resize(m + 2);
246 for(i = 0; i <= n+1; i++) {
247 matrix[i][0] = i;
250 for(j = 0; j <= m+1; j++) {
251 matrix[0][j] = j;
254 for(i = n+1; i >= 1; i--) {
255 for(j = m+1; j >= 1; j--) {
256 // wcout << L"n: " << n+1 << L" m: " << m+1 << L" i: " << i << L" j: " << j << endl;
257 if(i == n+1 || j == m+1) {
258 matrix[i][j] = 0;
259 } else if(string1.at(i-1) == string2.at(j-1)) {
260 matrix[i][j] = 1 + matrix[i + 1][j + 1];
261 } else {
262 matrix[i][j] = max(matrix[i + 1][j], matrix[i][j + 1]);
267 for(i = 0; i <= n+1; i++) { // print matrix
268 wcout << endl;
269 for(j = 0; j <= m+1; j++){
270 wcout << matrix[i][j] << " ";
273 wcout << L" = ";
275 return matrix[1][1];
278 float longest_common_subsequence_coefficient(wstring string1, wstring string2)
280 unsigned int n = string1.length();
281 unsigned int m = string2.length();
283 int lcs = longest_common_subsequence(string1, string2);
285 return (float)lcs / (float)((n + m) / 2);